43 #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
48 #if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__)
51 #undef POINT_DATA_ALIGN4
52 #if defined(_M_ARM) && _M_ARM >= 7 && !defined(DISABLE_CPU_NEON)
54 #define CPUPBA_USE_NEON
55 #elif defined(__ARM_NEON) && !defined(DISABLE_CPU_NEON)
57 #define CPUPBA_USE_NEON
59 #elif defined(__AVX__) && !defined(DISABLE_CPU_AVX)
60 #include <immintrin.h>
61 #define CPUPBA_USE_AVX
63 #undef POINT_DATA_ALIGN4
64 #elif (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)) && !defined(DISABLE_CPU_SSE)
65 #define CPUPBA_USE_SSE
66 #include <xmmintrin.h>
67 #include <emmintrin.h>
70 #ifdef POINT_DATA_ALIGN4
76 #define POINT_ALIGN2 (POINT_ALIGN * 2)
82 #define finite _finite
90 #define THREAD_NUM_MAX 64
92 #define AUTO_MT_NUM(sz) \
93 int((log((double)sz) / log(2.0) - 18.5) * __num_cpu_cores / 16.0)
97 template <
class Float>
100 for (Float* p = _data; p < _last; ++p) out << (*p) <<
'\n';
103 #ifdef CPUPBA_USE_SSE
104 #define CPUPBA_USE_SIMD
106 template <
class Float>
111 typedef __m128 sse_type;
112 static inline sse_type zero() {
return _mm_setzero_ps(); }
117 typedef __m128d sse_type;
118 static inline sse_type zero() {
return _mm_setzero_pd(); }
122 template <
class Float>
123 inline size_t sse_step() {
124 return 16 /
sizeof(Float);
126 inline __m128 sse_load1(
const float* p) {
return _mm_load1_ps(p); }
127 inline __m128 sse_load(
const float* p) {
return _mm_load_ps(p); }
128 inline __m128 sse_add(__m128 s1, __m128 s2) {
return _mm_add_ps(s1, s2); }
129 inline __m128 sse_sub(__m128 s1, __m128 s2) {
return _mm_sub_ps(s1, s2); }
130 inline __m128 sse_mul(__m128 s1, __m128 s2) {
return _mm_mul_ps(s1, s2); }
131 inline __m128 sse_sqrt(__m128 s) {
return _mm_sqrt_ps(s); }
133 inline __m128d sse_load1(
const double* p) {
return _mm_load1_pd(p); }
134 inline __m128d sse_load(
const double* p) {
return _mm_load_pd(p); }
135 inline __m128d sse_add(__m128d s1, __m128d s2) {
return _mm_add_pd(s1, s2); }
136 inline __m128d sse_sub(__m128d s1, __m128d s2) {
return _mm_sub_pd(s1, s2); }
137 inline __m128d sse_mul(__m128d s1, __m128d s2) {
return _mm_mul_pd(s1, s2); }
138 inline __m128d sse_sqrt(__m128d s) {
return _mm_sqrt_pd(s); }
141 inline float sse_sum(__m128 s) {
142 return (s.m128_f32[0] + s.m128_f32[2]) + (s.m128_f32[1] + s.m128_f32[3]);
144 inline double sse_sum(__m128d s) {
return s.m128d_f64[0] + s.m128d_f64[1]; }
// Horizontal sum of a 4-float SSE register (non-MSVC path).
// Fix: spill the register with _mm_store_ps into a local aligned array
// instead of type-punning through (float*)(&s), which relies on
// compiler-specific aliasing leniency. The pairwise summation order
// (f0 + f2) + (f1 + f3) is kept identical to preserve float rounding.
inline float sse_sum(__m128 s) {
  alignas(16) float f[4];
  _mm_store_ps(f, s);
  return (f[0] + f[2]) + (f[1] + f[3]);
}
150 inline double sse_sum(__m128d s) {
151 double* d = (
double*)(&s);
159 inline void sse_store(
float* p, __m128 s) { _mm_store_ps(p, s); }
160 inline void sse_store(
double* p, __m128d s) { _mm_store_pd(p, s); }
162 inline void data_prefetch(
const void* p) {
163 _mm_prefetch((
const char*)p, _MM_HINT_NTA);
167 namespace ProgramCPU {
168 using namespace MYSSE;
169 #define SSE_ZERO SSE<Float>::zero()
170 #define SSE_T typename SSE<Float>::sse_type
// Scale two 4-wide Jacobian rows (x and y) element-wise by sj.
// All pointers must be 16-byte aligned.
inline void ScaleJ4(float* jcx, float* jcy, const float* sj) {
  const __m128 scale = _mm_load_ps(sj);
  const __m128 row_x = _mm_mul_ps(_mm_load_ps(jcx), scale);
  const __m128 row_y = _mm_mul_ps(_mm_load_ps(jcy), scale);
  _mm_store_ps(jcx, row_x);
  _mm_store_ps(jcy, row_y);
}
// Scale two 8-wide Jacobian rows by sj; the two 4-wide halves are
// processed inline (equivalent to two ScaleJ4 calls).
inline void ScaleJ8(float* jcx, float* jcy, const float* sj) {
  const __m128 s0 = _mm_load_ps(sj);
  const __m128 s1 = _mm_load_ps(sj + 4);
  _mm_store_ps(jcx, _mm_mul_ps(_mm_load_ps(jcx), s0));
  _mm_store_ps(jcy, _mm_mul_ps(_mm_load_ps(jcy), s0));
  _mm_store_ps(jcx + 4, _mm_mul_ps(_mm_load_ps(jcx + 4), s1));
  _mm_store_ps(jcy + 4, _mm_mul_ps(_mm_load_ps(jcy + 4), s1));
}
// Double-precision variant: four doubles span two __m128d registers,
// so process the row in 2-element strides.
inline void ScaleJ4(double* jcx, double* jcy, const double* sj) {
  for (int k = 0; k < 4; k += 2) {
    const __m128d scale = _mm_load_pd(sj + k);
    _mm_store_pd(jcx + k, _mm_mul_pd(_mm_load_pd(jcx + k), scale));
    _mm_store_pd(jcy + k, _mm_mul_pd(_mm_load_pd(jcy + k), scale));
  }
}
// Scale two 8-wide double rows by sj, 2 elements per SSE2 register
// (equivalent to two ScaleJ4 calls over the two halves).
inline void ScaleJ8(double* jcx, double* jcy, const double* sj) {
  for (int k = 0; k < 8; k += 2) {
    const __m128d scale = _mm_load_pd(sj + k);
    _mm_store_pd(jcx + k, _mm_mul_pd(_mm_load_pd(jcx + k), scale));
    _mm_store_pd(jcy + k, _mm_mul_pd(_mm_load_pd(jcy + k), scale));
  }
}
192 inline float DotProduct8(
const float* v1,
const float* v2) {
193 __m128 ds = _mm_add_ps(_mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2)),
194 _mm_mul_ps(_mm_load_ps(v1 + 4), _mm_load_ps(v2 + 4)));
197 inline double DotProduct8(
const double* v1,
const double* v2) {
198 __m128d d1 = _mm_mul_pd(_mm_load_pd(v1), _mm_load_pd(v2));
199 __m128d d2 = _mm_mul_pd(_mm_load_pd(v1 + 2), _mm_load_pd(v2 + 2));
200 __m128d d3 = _mm_mul_pd(_mm_load_pd(v1 + 4), _mm_load_pd(v2 + 4));
201 __m128d d4 = _mm_mul_pd(_mm_load_pd(v1 + 6), _mm_load_pd(v2 + 6));
202 __m128d ds = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4));
206 inline void ComputeTwoJX(
const float* jc,
const float* jp,
const float* xc,
207 const float* xp,
float* jx) {
208 #ifdef POINT_DATA_ALIGN4
209 __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4),
210 mxp = _mm_load_ps(xp);
211 __m128 ds1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc), xc1),
212 _mm_mul_ps(_mm_load_ps(jc + 4), xc2));
213 __m128 dx1 = _mm_add_ps(ds1, _mm_mul_ps(_mm_load_ps(jp), mxp));
214 jx[0] = sse_sum(dx1);
215 __m128 ds2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc + 8), xc1),
216 _mm_mul_ps(_mm_load_ps(jc + 12), xc2));
217 __m128 dx2 = _mm_add_ps(ds2, _mm_mul_ps(_mm_load_ps(jp + 4), mxp));
218 jx[1] = sse_sum(dx2);
220 __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4);
221 __m128 jc1 = _mm_load_ps(jc), jc2 = _mm_load_ps(jc + 4);
222 __m128 jc3 = _mm_load_ps(jc + 8), jc4 = _mm_load_ps(jc + 12);
223 __m128 ds1 = _mm_add_ps(_mm_mul_ps(jc1, xc1), _mm_mul_ps(jc2, xc2));
224 __m128 ds2 = _mm_add_ps(_mm_mul_ps(jc3, xc1), _mm_mul_ps(jc4, xc2));
225 jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
236 inline void ComputeTwoJX(
const double* jc,
const double* jp,
const double* xc,
237 const double* xp,
double* jx) {
238 __m128d xc1 = _mm_load_pd(xc), xc2 = _mm_load_pd(xc + 2),
239 xc3 = _mm_load_pd(xc + 4), xc4 = _mm_load_pd(xc + 6);
240 __m128d d1 = _mm_mul_pd(_mm_load_pd(jc), xc1);
241 __m128d d2 = _mm_mul_pd(_mm_load_pd(jc + 2), xc2);
242 __m128d d3 = _mm_mul_pd(_mm_load_pd(jc + 4), xc3);
243 __m128d d4 = _mm_mul_pd(_mm_load_pd(jc + 6), xc4);
244 __m128d ds1 = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4));
245 jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
247 __m128d d5 = _mm_mul_pd(_mm_load_pd(jc + 8), xc1);
248 __m128d d6 = _mm_mul_pd(_mm_load_pd(jc + 10), xc2);
249 __m128d d7 = _mm_mul_pd(_mm_load_pd(jc + 12), xc3);
250 __m128d d8 = _mm_mul_pd(_mm_load_pd(jc + 14), xc4);
251 __m128d ds2 = _mm_add_pd(_mm_add_pd(d5, d6), _mm_add_pd(d7, d8));
// v[0..7] += a * x[0..7], 4 floats at a time.
inline void AddScaledVec8(float a, const float* x, float* v) {
  const __m128 aa = _mm_set1_ps(a);
  for (int k = 0; k < 8; k += 4) {
    const __m128 acc =
        _mm_add_ps(_mm_mul_ps(_mm_load_ps(x + k), aa), _mm_load_ps(v + k));
    _mm_store_ps(v + k, acc);
  }
}
// v[0..7] += a * x[0..7], double precision, 2 doubles per register.
inline void AddScaledVec8(double a, const double* x, double* v) {
  const __m128d aa = _mm_set1_pd(a);
  for (int k = 0; k < 8; k += 2) {
    const __m128d acc =
        _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + k), aa), _mm_load_pd(v + k));
    _mm_store_pd(v + k, acc);
  }
}
// Accumulate the outer product jc^T * jc into an 8-column block of the
// J^T J matrix: row i of `block` gains jc[i] * jc[0..7]. Only the first
// vn rows are touched; jc and block must be 16-byte aligned.
inline void AddBlockJtJ(const float* jc, float* block, int vn) {
  const __m128 lo = _mm_load_ps(jc);
  const __m128 hi = _mm_load_ps(jc + 4);
  for (int i = 0; i < vn; ++i, block += 8) {
    const __m128 a = _mm_set1_ps(jc[i]);
    _mm_store_ps(block, _mm_add_ps(_mm_load_ps(block), _mm_mul_ps(a, lo)));
    _mm_store_ps(block + 4,
                 _mm_add_ps(_mm_load_ps(block + 4), _mm_mul_ps(a, hi)));
  }
}
// Double-precision J^T J accumulation: row i of `block` gains
// jc[i] * jc[0..7]. The 8-wide row spans four __m128d registers.
inline void AddBlockJtJ(const double* jc, double* block, int vn) {
  const __m128d r0 = _mm_load_pd(jc);
  const __m128d r1 = _mm_load_pd(jc + 2);
  const __m128d r2 = _mm_load_pd(jc + 4);
  const __m128d r3 = _mm_load_pd(jc + 6);
  for (int i = 0; i < vn; ++i, block += 8) {
    const __m128d a = _mm_set1_pd(jc[i]);
    _mm_store_pd(block, _mm_add_pd(_mm_load_pd(block), _mm_mul_pd(a, r0)));
    _mm_store_pd(block + 2,
                 _mm_add_pd(_mm_load_pd(block + 2), _mm_mul_pd(a, r1)));
    _mm_store_pd(block + 4,
                 _mm_add_pd(_mm_load_pd(block + 4), _mm_mul_pd(a, r2)));
    _mm_store_pd(block + 6,
                 _mm_add_pd(_mm_load_pd(block + 6), _mm_mul_pd(a, r3)));
  }
}
308 #ifdef CPUPBA_USE_AVX
309 #define CPUPBA_USE_SIMD
311 template <
class Float>
316 typedef __m256 sse_type;
317 static inline sse_type zero() {
return _mm256_setzero_ps(); }
322 typedef __m256d sse_type;
323 static inline sse_type zero() {
return _mm256_setzero_pd(); }
327 template <
class Float>
328 inline size_t sse_step() {
329 return 32 /
sizeof(Float);
331 inline __m256 sse_load1(
const float* p) {
return _mm256_broadcast_ss(p); }
332 inline __m256 sse_load(
const float* p) {
return _mm256_load_ps(p); }
333 inline __m256 sse_add(__m256 s1, __m256 s2) {
return _mm256_add_ps(s1, s2); }
334 inline __m256 sse_sub(__m256 s1, __m256 s2) {
return _mm256_sub_ps(s1, s2); }
335 inline __m256 sse_mul(__m256 s1, __m256 s2) {
return _mm256_mul_ps(s1, s2); }
336 inline __m256 sse_sqrt(__m256 s) {
return _mm256_sqrt_ps(s); }
341 inline __m256d sse_load1(
const double* p) {
return _mm256_broadcast_sd(p); }
342 inline __m256d sse_load(
const double* p) {
return _mm256_load_pd(p); }
343 inline __m256d sse_add(__m256d s1, __m256d s2) {
return _mm256_add_pd(s1, s2); }
344 inline __m256d sse_sub(__m256d s1, __m256d s2) {
return _mm256_sub_pd(s1, s2); }
345 inline __m256d sse_mul(__m256d s1, __m256d s2) {
return _mm256_mul_pd(s1, s2); }
346 inline __m256d sse_sqrt(__m256d s) {
return _mm256_sqrt_pd(s); }
349 inline float sse_sum(__m256 s) {
350 return ((s.m256_f32[0] + s.m256_f32[4]) + (s.m256_f32[2] + s.m256_f32[6])) +
351 ((s.m256_f32[1] + s.m256_f32[5]) + (s.m256_f32[3] + s.m256_f32[7]));
353 inline double sse_sum(__m256d s) {
354 return (s.m256d_f64[0] + s.m256d_f64[2]) + (s.m256d_f64[1] + s.m256d_f64[3]);
357 inline float sse_sum(__m256 s) {
358 float* f = (
float*)(&s);
359 return ((f[0] + f[4]) + (f[2] + f[6])) + ((f[1] + f[5]) + (f[3] + f[7]));
361 inline double sse_sum(__m256d s) {
362 double* d = (
double*)(&s);
363 return (d[0] + d[2]) + (d[1] + d[3]);
366 inline float sse_dot(__m256 s1, __m256 s2) {
367 __m256 temp = _mm256_dp_ps(s1, s2, 0xf1);
368 float* f = (
float*)(&temp);
371 inline double sse_dot(__m256d s1, __m256d s2) {
372 return sse_sum(sse_mul(s1, s2));
375 inline void sse_store(
float* p, __m256 s) { _mm256_store_ps(p, s); }
376 inline void sse_store(
double* p, __m256d s) { _mm256_store_pd(p, s); }
378 inline void data_prefetch(
const void* p) {
379 _mm_prefetch((
const char*)p, _MM_HINT_NTA);
383 namespace ProgramCPU {
384 using namespace MYAVX;
385 #define SSE_ZERO SSE<Float>::zero()
386 #define SSE_T typename SSE<Float>::sse_type
389 inline void ScaleJ8(
float* jcx,
float* jcy,
const float* sj) {
390 __m256 ps = _mm256_load_ps(sj);
391 _mm256_store_ps(jcx, _mm256_mul_ps(_mm256_load_ps(jcx), ps));
392 _mm256_store_ps(jcy, _mm256_mul_ps(_mm256_load_ps(jcy), ps));
394 inline void ScaleJ4(
double* jcx,
double* jcy,
const double* sj) {
395 __m256d ps = _mm256_load_pd(sj);
396 _mm256_store_pd(jcx, _mm256_mul_pd(_mm256_load_pd(jcx), ps));
397 _mm256_store_pd(jcy, _mm256_mul_pd(_mm256_load_pd(jcy), ps));
399 inline void ScaleJ8(
double* jcx,
double* jcy,
const double* sj) {
400 ScaleJ4(jcx, jcy, sj);
401 ScaleJ4(jcx + 4, jcy + 4, sj + 4);
403 inline float DotProduct8(
const float* v1,
const float* v2) {
404 return sse_dot(_mm256_load_ps(v1), _mm256_load_ps(v2));
406 inline double DotProduct8(
const double* v1,
const double* v2) {
407 __m256d ds = _mm256_add_pd(
408 _mm256_mul_pd(_mm256_load_pd(v1), _mm256_load_pd(v2)),
409 _mm256_mul_pd(_mm256_load_pd(v1 + 4), _mm256_load_pd(v2 + 4)));
413 inline void ComputeTwoJX(
const float* jc,
const float* jp,
const float* xc,
414 const float* xp,
float* jx) {
415 __m256 xcm = _mm256_load_ps(xc), jc1 = _mm256_load_ps(jc),
416 jc2 = _mm256_load_ps(jc + 8);
417 jx[0] = sse_dot(jc1, xcm) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
418 jx[1] = sse_dot(jc2, xcm) +
423 inline void ComputeTwoJX(
const double* jc,
const double* jp,
const double* xc,
424 const double* xp,
double* jx) {
425 __m256d xc1 = _mm256_load_pd(xc), xc2 = _mm256_load_pd(xc + 4);
426 __m256d jc1 = _mm256_load_pd(jc), jc2 = _mm256_load_pd(jc + 4);
427 __m256d jc3 = _mm256_load_pd(jc + 8), jc4 = _mm256_load_pd(jc + 12);
428 __m256d ds1 = _mm256_add_pd(_mm256_mul_pd(jc1, xc1), _mm256_mul_pd(jc2, xc2));
429 __m256d ds2 = _mm256_add_pd(_mm256_mul_pd(jc3, xc1), _mm256_mul_pd(jc4, xc2));
430 jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
437 inline void AddScaledVec8(
float a,
const float* x,
float* v) {
438 __m256 aa = sse_load1(&a);
439 _mm256_store_ps(v, _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(x), aa),
445 inline void AddScaledVec8(
double a,
const double* x,
double* v) {
446 __m256d aa = sse_load1(&a);
447 _mm256_store_pd(v, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x), aa),
449 _mm256_store_pd(v + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x + 4), aa),
450 _mm256_load_pd(v + 4)));
453 inline void AddBlockJtJ(
const float* jc,
float* block,
int vn) {
454 __m256 j = _mm256_load_ps(jc);
455 for (
int i = 0; i < vn; ++i, ++jc, block += 8) {
456 __m256 a = sse_load1(jc);
457 _mm256_store_ps(block,
458 _mm256_add_ps(_mm256_mul_ps(a, j), _mm256_load_ps(block)));
462 inline void AddBlockJtJ(
const double* jc,
double* block,
int vn) {
463 __m256d j1 = _mm256_load_pd(jc);
464 __m256d j2 = _mm256_load_pd(jc + 4);
465 for (
int i = 0; i < vn; ++i, ++jc, block += 8) {
466 __m256d a = sse_load1(jc);
467 _mm256_store_pd(block + 0, _mm256_add_pd(_mm256_mul_pd(a, j1),
468 _mm256_load_pd(block + 0)));
469 _mm256_store_pd(block + 4, _mm256_add_pd(_mm256_mul_pd(a, j2),
470 _mm256_load_pd(block + 4)));
477 #ifdef CPUPBA_USE_NEON
478 #define CPUPBA_USE_SIMD
480 #define SIMD_NO_DOUBLE
482 template <
class Float>
487 typedef float32x4_t sse_type;
491 template <
class Float>
492 inline size_t sse_step() {
493 return 16 /
sizeof(Float);
495 inline float32x4_t sse_load1(
const float* p) {
return vld1q_dup_f32(p); }
496 inline float32x4_t sse_load(
const float* p) {
return vld1q_f32(p); }
497 inline float32x4_t sse_loadzero() {
499 return sse_load1(&z);
501 inline float32x4_t sse_add(float32x4_t s1, float32x4_t s2) {
502 return vaddq_f32(s1, s2);
504 inline float32x4_t sse_sub(float32x4_t s1, float32x4_t s2) {
505 return vsubq_f32(s1, s2);
507 inline float32x4_t sse_mul(float32x4_t s1, float32x4_t s2) {
508 return vmulq_f32(s1, s2);
// Horizontal sum of a NEON 4-float vector.
// Fix: spill the lanes with vst1q_f32 instead of casting &s to float*,
// which type-puns through the vector's address; summation order
// (f0 + f2) + (f1 + f3) is unchanged to preserve float rounding.
inline float sse_sum(float32x4_t s) {
  float f[4];
  vst1q_f32(f, s);
  return (f[0] + f[2]) + (f[1] + f[3]);
}
516 inline void sse_store(
float* p, float32x4_t s) { vst1q_f32(p, s); }
517 inline void data_prefetch(
const void* p) {}
519 namespace ProgramCPU {
520 using namespace MYNEON;
521 #define SSE_ZERO sse_loadzero()
522 #define SSE_T typename SSE<Float>::sse_type
// Scale two 4-wide Jacobian rows element-wise by sj (NEON path,
// via the MYNEON wrapper functions).
inline void ScaleJ4(float* jcx, float* jcy, const float* sj) {
  const float32x4_t scale = sse_load(sj);
  const float32x4_t row_x = sse_mul(sse_load(jcx), scale);
  const float32x4_t row_y = sse_mul(sse_load(jcy), scale);
  sse_store(jcx, row_x);
  sse_store(jcy, row_y);
}
// Scale two 8-wide rows by sj, processing the two 4-wide halves in a
// loop (equivalent to two ScaleJ4 calls, inlined here).
inline void ScaleJ8(float* jcx, float* jcy, const float* sj) {
  for (int k = 0; k < 8; k += 4) {
    const float32x4_t scale = sse_load(sj + k);
    sse_store(jcx + k, sse_mul(sse_load(jcx + k), scale));
    sse_store(jcy + k, sse_mul(sse_load(jcy + k), scale));
  }
}
534 inline float DotProduct8(
const float* v1,
const float* v2) {
535 float32x4_t ds = sse_add(sse_mul(sse_load(v1), sse_load(v2)),
536 sse_mul(sse_load(v1 + 4), sse_load(v2 + 4)));
540 inline void ComputeTwoJX(
const float* jc,
const float* jp,
const float* xc,
541 const float* xp,
float* jx) {
542 #ifdef POINT_DATA_ALIGN4
543 float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4), mxp = sse_load(xp);
545 sse_add(sse_mul(sse_load(jc), xc1), sse_mul(sse_load(jc + 4), xc2));
546 float32x4_t dx1 = sse_add(ds1, sse_mul(sse_load(jp), mxp));
547 jx[0] = sse_sum(dx1);
549 sse_add(sse_mul(sse_load(jc + 8), xc1), sse_mul(sse_load(jc + 12), xc2));
550 float32x4_t dx2 = sse_add(ds2, sse_mul(sse_load(jp + 4), mxp));
551 jx[1] = sse_sum(dx2);
553 float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4);
554 float32x4_t jc1 = sse_load(jc), jc2 = sse_load(jc + 4);
555 float32x4_t jc3 = sse_load(jc + 8), jc4 = sse_load(jc + 12);
556 float32x4_t ds1 = sse_add(sse_mul(jc1, xc1), sse_mul(jc2, xc2));
557 float32x4_t ds2 = sse_add(sse_mul(jc3, xc1), sse_mul(jc4, xc2));
558 jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
// v[0..7] += a * x[0..7] (NEON path), 4 floats per iteration.
inline void AddScaledVec8(float a, const float* x, float* v) {
  const float32x4_t aa = sse_load1(&a);
  for (int k = 0; k < 8; k += 4) {
    sse_store(v + k, sse_add(sse_mul(sse_load(x + k), aa), sse_load(v + k)));
  }
}
// NEON J^T J accumulation: row i of `block` gains jc[i] * jc[0..7];
// the 8-wide row spans two float32x4_t registers.
inline void AddBlockJtJ(const float* jc, float* block, int vn) {
  const float32x4_t lo = sse_load(jc);
  const float32x4_t hi = sse_load(jc + 4);
  for (int i = 0; i < vn; ++i, block += 8) {
    const float32x4_t a = sse_load1(jc + i);
    sse_store(block, sse_add(sse_mul(a, lo), sse_load(block)));
    sse_store(block + 4, sse_add(sse_mul(a, hi), sse_load(block + 4)));
  }
}
588 namespace ProgramCPU {
590 template <
class Float>
593 #if defined(CPUPBA_USE_SIMD)
594 template <
class Float>
597 const size_t step = sse_step<Float>();
598 Float *p = &vec[0], *pe = p + vec.
size(), *pex = pe - step;
599 for (; p <= pex; p += step) sse_store(p, sse_sqrt(sse_load(p)));
600 for (; p < pe; ++p) p[0] = sqrt(p[0]);
602 for (Float* it = vec.
begin(); it < vec.
end(); ++it) *it = sqrt(*it);
606 template <
class Float>
608 Float *p = &vec[0], *pe = p + vec.
size();
609 for (; p < pe; ++p) p[0] = (p[0] == 0 ? 0 : Float(1.0) / p[0]);
613 template <
class Float>
615 SSE_T sse = SSE_ZERO;
616 const size_t step = sse_step<Float>();
617 Float* pex = pe - step;
618 for (; p <= pex; p += step) sse_store(p, sse);
619 for (; p < pe; ++p) *p = 0;
622 template <
class Float>
624 Float *p = &vec[0], *pe = p + vec.size();
629 template <
class Float>
630 inline void MemoryCopyA(
const Float* p,
const Float* pe, Float* d) {
631 const size_t step = sse_step<Float>();
632 const Float* pex = pe - step;
633 for (; p <= pex; p += step, d += step) sse_store(d, sse_load(p));
637 template <
class Float>
639 SSE_T sse = SSE_ZERO;
640 const size_t step = sse_step<Float>();
641 const Float* pex = pe - step;
642 for (; p <= pex; p += step) {
643 SSE_T ps = sse_load(p);
644 sse = sse_add(sse, sse_mul(ps, ps));
646 double sum = sse_sum(sse);
647 for (; p < pe; ++p) sum += p[0] * p[0];
651 template <
class Float>
653 if (weight.begin() !=
NULL) {
654 SSE_T sse = SSE_ZERO;
655 const size_t step = sse_step<Float>();
656 const Float *p = vec, *pe = p + vec.size(), *pex = pe - step;
657 const Float* w = weight;
658 for (; p <= pex; p += step, w += step) {
659 SSE_T pw = sse_load(w), ps = sse_load(p);
660 sse = sse_add(sse, sse_mul(sse_mul(ps, pw), ps));
662 double sum = sse_sum(sse);
663 for (; p < pe; ++p, ++w) sum += p[0] * w[0] * p[0];
666 return ComputeVectorNorm<Float>(vec, 0);
670 template <
class Float>
672 SSE_T sse = SSE_ZERO;
673 const size_t step = sse_step<Float>();
674 const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step;
675 const Float* p2 = vec2;
676 for (; p1 <= pex; p1 += step, p2 += step) {
677 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
678 sse = sse_add(sse, sse_mul(ps1, ps2));
680 double sum = sse_sum(sse);
681 for (; p1 < pe; ++p1, ++p2) sum += p1[0] * p2[0];
685 template <
class Float>
686 void ComputeVXY(
const avec<Float>& vec1,
const avec<Float>& vec2,
687 avec<Float>&
result,
size_t part = 0,
size_t skip = 0) {
688 const size_t step = sse_step<Float>();
689 const Float *p1 = vec1 + skip, *pe = p1 + (part ? part : vec1.size()),
691 const Float* p2 = vec2 + skip;
692 Float* p3 =
result + skip;
693 for (; p1 <= pex; p1 += step, p2 += step, p3 += step) {
694 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
695 sse_store(p3, sse_mul(ps1, ps2));
697 for (; p1 < pe; ++p1, ++p2, ++p3) *p3 = p1[0] * p2[0];
700 template <
class Float>
701 void ComputeSAXPY(Float a,
const Float* p1,
const Float* p2, Float* p3,
703 const size_t step = sse_step<Float>();
704 SSE_T aa = sse_load1(&a);
705 Float* pex = pe - step;
707 for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
708 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
709 sse_store(p3, sse_add(ps2, ps1));
711 }
else if (a == -1.0f) {
712 for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
713 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
714 sse_store(p3, sse_sub(ps2, ps1));
717 for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
718 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
719 sse_store(p3, sse_add(ps2, sse_mul(ps1, aa)));
722 for (; p3 < pe; ++p1, ++p2, ++p3) p3[0] = a * p1[0] + p2[0];
725 template <
class Float>
727 const size_t step = sse_step<Float>();
728 SSE_T aa = sse_load1(&a);
729 const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step;
731 for (; p1 <= pex; p1 += step, p3 += step) {
732 sse_store(p3, sse_mul(sse_load(p1), aa));
734 for (; p1 < pe; ++p1, ++p3) p3[0] = a * p1[0];
737 template <
class Float>
738 inline void ComputeSXYPZ(Float a,
const Float* p1,
const Float* p2,
739 const Float* p3, Float* p4, Float* pe) {
740 const size_t step = sse_step<Float>();
741 SSE_T aa = sse_load1(&a);
742 Float* pex = pe - step;
743 for (; p4 <= pex; p1 += step, p2 += step, p3 += step, p4 += step) {
744 SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2), ps3 = sse_load(p3);
745 sse_store(p4, sse_add(ps3, sse_mul(sse_mul(ps1, aa), ps2)));
747 for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) p4[0] = a * p1[0] * p2[0] + p3[0];
751 template <
class Float>
753 Float* it = vec.
begin();
754 for (; it < vec.
end(); ++it) {
758 template <
class Float>
760 Float* it = vec.
begin();
761 for (; it < vec.
end(); ++it) {
762 *it = (*it == 0 ? 0 : Float(1.0) / sqrt(*it));
765 template <
class Float>
769 template <
class Float>
771 std::fill(vec.
begin(), vec.
end(), 0);
// Scalar fallback copy of [p, pe) into d (non-SIMD build); the SIMD
// build provides an aligned vectorized variant under the same name.
template <class Float>
inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) {
  for (; p < pe; ++p, ++d) {
    *d = *p;
  }
}
779 template <
class Float>
782 const Float *it1 = vec.
begin(), *it2 = weight.
begin();
783 for (; it1 < vec.
end(); ++it1, ++it2) {
784 sum += (*it1) * (*it2) * (*it1);
789 template <
class Float>
792 const Float *it1 = vec1.
begin(), *it2 = vec2.
begin();
793 for (; it1 < vec1.
end(); ++it1, ++it2) {
794 sum += (*it1) * (*it2);
798 template <
class Float>
801 for (; p < pe; ++p) sum += (*p) * (*p);
804 template <
class Float>
807 const Float *it1 = vec1.
begin() + skip, *it2 = vec2.
begin() + skip;
808 const Float* ite = part ? (it1 + part) : vec1.
end();
809 Float* it3 =
result.begin() + skip;
810 for (; it1 < ite; ++it1, ++it2, ++it3) {
811 (*it3) = (*it1) * (*it2);
814 template <
class Float>
815 void ScaleJ8(Float* jcx, Float* jcy,
const Float* sj) {
816 for (
int i = 0; i < 8; ++i) {
822 template <
class Float>
824 for (
int i = 0; i < 8; ++i) v[i] += (a * x[i]);
827 template <
class Float>
829 const Float* it1 = vec1.
begin();
830 Float* it3 =
result.begin();
831 for (; it1 < vec1.
end(); ++it1, ++it3) {
832 (*it3) = (a * (*it1));
836 template <
class Float>
838 const Float* p3, Float* p4, Float* pe) {
839 for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) *p4 = (a * (*p1) * (*p2) + (*p3));
842 template <
class Float>
843 void ComputeSAXPY(Float a,
const Float* it1,
const Float* it2, Float* it3,
845 if (a == (Float)1.0) {
846 for (; it3 < ite; ++it1, ++it2, ++it3) {
847 (*it3) = ((*it1) + (*it2));
850 for (; it3 < ite; ++it1, ++it2, ++it3) {
851 (*it3) = (a * (*it1) + (*it2));
855 template <
class Float>
857 for (
int i = 0; i < vn; ++i) {
858 Float *row = block + i * 8, a = jc[i];
859 for (
int j = 0; j < vn; ++j) row[j] += a * jc[j];
865 #define DEFINE_THREAD_DATA(X) \
866 template <class Float> \
868 #define DECLEAR_THREAD_DATA(X, ...) \
869 X##_STRUCT<Float> tdata = {__VA_ARGS__}; \
870 X##_STRUCT<Float>* newdata = new X##_STRUCT<Float>(tdata)
871 #define BEGIN_THREAD_PROC(X) \
874 template <class Float> \
875 DWORD X##_PROC(X##_STRUCT<Float>* q) {
876 #define END_THREAD_RPOC(X) \
881 #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
882 #define MYTHREAD std::thread
883 #define RUN_THREAD(X, t, ...) \
884 DECLEAR_THREAD_DATA(X, __VA_ARGS__); \
885 t = std::thread(X##_PROC<Float>, newdata)
886 #define WAIT_THREAD(tv, n) \
888 for (size_t i = 0; i < size_t(n); ++i) tv[i].join(); \
891 #define MYTHREAD HANDLE
892 #define RUN_THREAD(X, t, ...) \
893 DECLEAR_THREAD_DATA(X, __VA_ARGS__); \
894 t = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)X##_PROC<Float>, newdata, \
896 #define WAIT_THREAD(tv, n) \
898 WaitForMultipleObjects((DWORD)n, tv, TRUE, INFINITE); \
899 for (size_t i = 0; i < size_t(n); ++i) CloseHandle(tv[i]); \
903 #define DEFINE_THREAD_DATA(X) \
904 template <class Float> \
905 struct X##_STRUCT { \
907 #define DECLEAR_THREAD_DATA(X, ...) \
908 X##_STRUCT<Float> tdata = {i, __VA_ARGS__}; \
909 X##_STRUCT<Float>* newdata = new X##_STRUCT<Float>(tdata)
910 #define BEGIN_THREAD_PROC(X) \
913 template <class Float> \
914 void* X##_PROC(X##_STRUCT<Float>* q) {
921 #define END_THREAD_RPOC(X) \
925 template <class Float> \
926 struct X##_FUNCTOR { \
927 typedef void* (*func_type)(X##_STRUCT<Float>*); \
928 static func_type get() { return &(X##_PROC<Float>); } \
930 #define MYTHREAD pthread_t
932 #define RUN_THREAD(X, t, ...) \
933 DECLEAR_THREAD_DATA(X, __VA_ARGS__); \
934 pthread_create(&t, NULL, (void* (*)(void*))X##_FUNCTOR<Float>::get(), newdata)
935 #define WAIT_THREAD(tv, n) \
937 for (size_t i = 0; i < size_t(n); ++i) pthread_join(tv[i], NULL); \
// Element-by-element copy of [p, pe) into d; always scalar, used by
// both the SIMD and non-SIMD builds.
template <class Float>
inline void MemoryCopyB(const Float* p, const Float* pe, Float* d) {
  const long long count = pe - p;
  for (long long i = 0; i < count; ++i) {
    d[i] = p[i];
  }
}
945 template <
class Float>
947 return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2] + v1[3] * v2[3] +
948 v1[4] * v2[4] + v1[5] * v2[5] + v1[6] * v2[6] + v1[7] * v2[7];
950 template <
class Float>
951 inline void ComputeTwoJX(
const Float* jc,
const Float* jp,
const Float* xc,
952 const Float* xp, Float* jx) {
953 jx[0] =
DotProduct8(jc, xc) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
955 DotProduct8(jc + 8, xc) + (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
957 template <
class Float>
960 const Float* it = vec.
begin();
961 for (; it < vec.
end(); ++it) {
962 Float vi = (Float)
fabs(*it);
968 template <
class Float>
972 const Float *p1 = &vec1[0], *p2 = &vec2[0], *p3 = &vec3[0];
985 const Float *p1, *p2;
991 template <class Float>
994 const bool auto_multi_thread =
true;
995 if (auto_multi_thread && mt == 0) {
998 if (mt > 1 &&
result.size() >= mt * 4) {
1001 const Float *p1 = vec1.begin(), *p2 = vec2.begin();
1002 Float* p3 =
result.begin();
1003 for (
size_t i = 0; i < thread_num; ++i) {
1010 p3 + first, p3 + last);
1019 const Float *p, *pe;
1025 template <class Float>
1027 const bool auto_multi_thread =
true;
1028 if (auto_multi_thread && mt == 0) {
1031 if (mt > 1 && vec.size() >= mt * 4) {
1035 const Float* p = vec;
1036 for (
size_t i = 0; i < thread_num; ++i) {
1037 size_t first = (vec.size() * i / thread_num +
FLOAT_ALIGN - 1) /
1039 size_t last_ = (vec.size() * (i + 1) / thread_num +
FLOAT_ALIGN - 1) /
1041 size_t last =
std::min(last_, vec.size());
1046 for (
size_t i = 0; i < thread_num; ++i) sum += sumv[i];
1055 template <
class Float>
1058 double a = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0;
1059 const double epsilon = 0.01;
1060 if (
fabs(m[0][1] - m[1][0]) < epsilon &&
fabs(m[1][2] - m[2][1]) < epsilon &&
1061 fabs(m[0][2] - m[2][0]) < epsilon) {
1062 if (
fabs(m[0][1] + m[1][0]) < 0.1 &&
fabs(m[1][2] + m[2][1]) < 0.1 &&
1063 fabs(m[0][2] + m[2][0]) < 0.1 && a > 0.9) {
1068 const Float ha = Float(sqrt(0.5) * 3.14159265358979323846);
1069 double xx = (m[0][0] + 1.0) / 2.0;
1070 double yy = (m[1][1] + 1.0) / 2.0;
1071 double zz = (m[2][2] + 1.0) / 2.0;
1072 double xy = (m[0][1] + m[1][0]) / 4.0;
1073 double xz = (m[0][2] + m[2][0]) / 4.0;
1074 double yz = (m[1][2] + m[2][1]) / 4.0;
1076 if ((xx > yy) && (xx > zz)) {
1081 double t = sqrt(xx);
1082 r[0] = Float(t * 3.14159265358979323846);
1083 r[1] = Float(xy / t * 3.14159265358979323846);
1084 r[2] = Float(xz / t * 3.14159265358979323846);
1086 }
else if (yy > zz) {
1091 double t = sqrt(yy);
1092 r[0] = Float(xy / t * 3.14159265358979323846);
1093 r[1] = Float(t * 3.14159265358979323846);
1094 r[2] = Float(yz / t * 3.14159265358979323846);
1101 double t = sqrt(zz);
1102 r[0] = Float(xz / t * 3.14159265358979323846);
1103 r[1] = Float(yz / t * 3.14159265358979323846);
1104 r[2] = Float(t * 3.14159265358979323846);
1110 double b = 0.5 * a / sin(a);
1111 r[0] = Float(b * (m[2][1] - m[1][2]));
1112 r[1] = Float(b * (m[0][2] - m[2][0]));
1113 r[2] = Float(b * (m[1][0] - m[0][1]));
1116 template <
class Float>
1118 double a = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]);
1119 double ct = a == 0.0 ? 0.5f : (1.0f - cos(a)) / a / a;
1120 double st = a == 0.0 ? 1 : sin(a) / a;
1121 m[0] = Float(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct);
1122 m[1] = Float(r[0] * r[1] * ct - r[2] * st);
1123 m[2] = Float(r[2] * r[0] * ct + r[1] * st);
1124 m[3] = Float(r[0] * r[1] * ct + r[2] * st);
1125 m[4] = Float(1.0f - (r[2] * r[2] + r[0] * r[0]) * ct);
1126 m[5] = Float(r[1] * r[2] * ct - r[0] * st);
1127 m[6] = Float(r[2] * r[0] * ct - r[1] * st);
1128 m[7] = Float(r[1] * r[2] * ct + r[0] * st);
1129 m[8] = Float(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct);
1131 template <
class Float>
1134 const Float *c = &camera[0], *d = &delta[0];
1135 Float *nc = &new_camera[0], m[9];
1137 for (
int i = 0; i < ncam; ++i, c += 16, d += 8, nc += 16) {
1138 nc[0] =
max(c[0] + d[0], ((Float)1e-10));
1139 nc[1] = c[1] + d[1];
1140 nc[2] = c[2] + d[2];
1141 nc[3] = c[3] + d[3];
1142 nc[13] = c[13] + d[7];
1146 nc[4] = m[0] * c[4 + 0] + m[1] * c[4 + 3] + m[2] * c[4 + 6];
1147 nc[5] = m[0] * c[4 + 1] + m[1] * c[4 + 4] + m[2] * c[4 + 7];
1148 nc[6] = m[0] * c[4 + 2] + m[1] * c[4 + 5] + m[2] * c[4 + 8];
1149 nc[7] = m[3] * c[4 + 0] + m[4] * c[4 + 3] + m[5] * c[4 + 6];
1150 nc[8] = m[3] * c[4 + 1] + m[4] * c[4 + 4] + m[5] * c[4 + 7];
1151 nc[9] = m[3] * c[4 + 2] + m[4] * c[4 + 5] + m[5] * c[4 + 8];
1152 nc[10] = m[6] * c[4 + 0] + m[7] * c[4 + 3] + m[8] * c[4 + 6];
1153 nc[11] = m[6] * c[4 + 1] + m[7] * c[4 + 4] + m[8] * c[4 + 7];
1154 nc[12] = m[6] * c[4 + 2] + m[7] * c[4 + 5] + m[8] * c[4 + 8];
1164 template <
class Float>
1181 template <
class Float>
1183 const Float* ms,
const int* jmap, Float* pj,
int radial,
1191 int radial_distortion;
1194 q->radial_distortion, 0);
1197 template <class Float>
1199 const Float* ms, const
int* jmap, Float* pj,
int radial,
1201 if (mt > 1 && nproj >= mt) {
1204 for (
size_t i = 0; i < thread_num; ++i) {
1205 size_t first = nproj * i / thread_num;
1206 size_t last_ = nproj * (i + 1) / thread_num;
1207 size_t last =
std::min(last_, nproj);
1209 ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial);
1214 for (
size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) {
1215 const Float* c = camera + jmap[0] * 16;
1218 Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1];
1219 Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2];
1220 Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3];
1223 Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2);
1224 Float f_p2 = c[0] * rr / p2;
1225 pj[0] = ms[0] - p0 * f_p2;
1226 pj[1] = ms[1] - p1 * f_p2;
1227 }
else if (radial == -1) {
1228 Float f_p2 = c[0] / p2;
1229 Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]);
1230 pj[0] = ms[0] * rd - p0 * f_p2;
1231 pj[1] = ms[1] * rd - p1 * f_p2;
1233 pj[0] = ms[0] - p0 * c[0] / p2;
1234 pj[1] = ms[1] - p1 * c[0] / p2;
1240 template <
class Float>
1242 const Float* ms,
const int* jmap, Float* pj,
int radial,
1250 int radial_distortion;
1253 q->radial_distortion, 0);
// ComputeProjectionX: variant of ComputeProjection that moves the distortion
// factor onto the measurement side of the residual (divide instead of
// multiply). NOTE(review): extraction fragment — embedded line numbers and
// missing source lines; code kept verbatim.
1256 template <class Float>
1258 const Float* ms, const
int* jmap, Float* pj,
int radial,
// Multithreaded driver: even partition of the projection range.
1260 if (mt > 1 && nproj >= mt) {
1263 for (
size_t i = 0; i < thread_num; ++i) {
1264 size_t first = nproj * i / thread_num;
1265 size_t last_ = nproj * (i + 1) / thread_num;
1266 size_t last =
std::min(last_, nproj);
1268 ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial);
1272 for (
size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) {
// Same 16-float camera layout as ComputeProjection.
1273 const Float* c = camera + jmap[0] * 16;
1276 Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1];
1277 Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2];
1278 Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3];
// radial == 1: measurement divided by the projection-side factor rr.
1280 Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2);
1281 Float f_p2 = c[0] / p2;
1282 pj[0] = ms[0] / rr - p0 * f_p2;
1283 pj[1] = ms[1] / rr - p1 * f_p2;
1284 }
else if (radial == -1) {
// radial == -1: projection divided by the measurement-side factor rd.
1285 Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]);
1286 Float f_p2 = c[0] / p2 / rd;
1287 pj[0] = ms[0] - p0 * f_p2;
1288 pj[1] = ms[1] - p1 * f_p2;
// Default: undistorted pinhole residual.
1290 pj[0] = ms[0] - p0 * c[0] / p2;
1291 pj[1] = ms[1] - p1 * c[0] / p2;
1297 template <
class Float>
1299 const Float* wq, Float* pj) {
1300 for (
size_t i = 0; i < nq; ++i, qmap += 2, pj += 2, wq += 2) {
1301 const Float* c1 = camera + qmap[0] * 16;
1302 const Float* c2 = camera + qmap[1] * 16;
1303 pj[0] = -(c1[0] - c2[0]) * wq[0];
1304 pj[1] = -(c1[13] - c2[13]) * wq[1];
// ComputeJQX: product of the soft camera-constraint Jacobian JQ with a vector
// x, for nq constraint pairs. Each pair couples two cameras' parameter blocks
// (8 floats each); only components 0 (focal) and 7 (radial) are constrained,
// weighted by wq. NOTE(review): extraction fragment; the branch structure
// between the two loops (presumably `if (sj) ... else ...`) is missing.
1308 template <
class Float>
1309 void ComputeJQX(
size_t nq,
const Float* x,
const int* qmap,
const Float* wq,
1310 const Float* sj, Float* jx) {
// Scaled path: apply per-parameter Jacobian scaling sj before differencing.
1312 for (
size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) {
1313 int idx1 = qmap[0] * 8, idx2 = qmap[1] * 8;
1314 const Float* x1 = x + idx1;
1315 const Float* x2 = x + idx2;
1316 const Float* sj1 = sj + idx1;
1317 const Float* sj2 = sj + idx2;
1318 jx[0] = (x1[0] * sj1[0] - x2[0] * sj2[0]) * wq[0];
1319 jx[1] = (x1[7] * sj1[7] - x2[7] * sj2[7]) * wq[1];
// Unscaled path: plain weighted difference of the paired cameras' entries.
1322 for (
size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) {
1323 const Float* x1 = x + qmap[0] * 8;
1324 const Float* x2 = x + qmap[1] * 8;
1325 jx[0] = (x1[0] - x2[0]) * wq[0];
1326 jx[1] = (x1[7] - x2[7]) * wq[1];
// Accumulates JQ^T * e (transpose of the camera-constraint Jacobian times the
// constraint error pe) into the camera part of v, 8 floats per camera; only
// entries 0 and 7 receive contributions. NOTE(review): extraction fragment —
// the signature line and the definitions of `ip`/`in` (read from qlist) are
// missing; code kept verbatim.
1331 template <
class Float>
1333 const Float* wq,
const Float* sj, Float* v) {
// Scaled path: include per-parameter scaling sj.
1335 for (
size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8, sj += 8) {
// ip == -1 marks a camera with no constraint pair — skip it.
1337 if (ip == -1)
continue;
1339 const Float* e1 = pe + ip * 2;
1340 const Float* e2 = pe + in * 2;
1341 v[0] += wq[0] * sj[0] * (e1[0] - e2[0]);
1342 v[7] += wq[1] * sj[7] * (e1[1] - e2[1]);
// Unscaled path.
1345 for (
size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8) {
1347 if (ip == -1)
continue;
1349 const Float* e1 = pe + ip * 2;
1350 const Float* e2 = pe + in * 2;
1351 v[0] += wq[0] * (e1[0] - e2[0]);
1352 v[7] += wq[1] * (e1[1] - e2[1]);
// JacobianOne: analytic Jacobian of a single projection with respect to the
// camera parameters (jxc/jyc, 8 entries: focal, 3 translation, 3 rotation,
// distortion) and to the 3-D point (jxp/jyp). NOTE(review): extraction
// fragment — embedded line numbers, and the bodies of the `c[15] != 0`
// (constant-camera) branches are missing; code kept verbatim.
1357 template <
class Float>
1358 inline void JacobianOne(
const Float* c,
const Float* pt,
const Float* ms,
1359 Float* jxc, Float* jyc, Float* jxp, Float* jyp,
1360 bool intrinsic_fixed,
int radial_distortion) {
// r = rotation matrix (row-major 3x3 at c[4..12]).
1361 const Float* r = c + 4;
1362 Float x0 = c[4] * pt[0] + c[5] * pt[1] + c[6] * pt[2];
1363 Float y0 = c[7] * pt[0] + c[8] * pt[1] + c[9] * pt[2];
1364 Float z0 = c[10] * pt[0] + c[11] * pt[1] + c[12] * pt[2];
// Camera-space depth and normalized image coordinates.
1365 Float p2 = (z0 + c[3]);
1366 Float f_p2 = c[0] / p2;
1367 Float p0_p2 = (x0 + c[1]) / p2;
1368 Float p1_p2 = (y0 + c[2]) / p2;
// Projection-side radial distortion model.
1370 if (radial_distortion == 1) {
1371 Float rr1 = c[13] * p0_p2 * p0_p2;
1372 Float rr2 = c[13] * p1_p2 * p1_p2;
// Distortion-corrected focal factors for the x and y rows of the Jacobian.
1373 Float f_p2_x = Float(f_p2 * (1.0 + 3.0 * rr1 + rr2));
1374 Float f_p2_y = Float(f_p2 * (1.0 + 3.0 * rr2 + rr1));
// c[15] != 0 appears to flag a constant (fixed) camera — body lost in
// extraction; verify against the full source.
1376 #ifndef PBA_DISABLE_CONST_CAMERA
1377 if (c[15] != 0.0f) {
// jfc: derivative w.r.t. focal; ft_x_pn (assignment line missing here):
// derivative w.r.t. the distortion coefficient.
1397 Float jfc = intrinsic_fixed ? 0 : Float(1.0 + rr1 + rr2);
1399 intrinsic_fixed ? 0 : c[0] * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
// x-row of the camera Jacobian (entries 1-2, translation, missing here).
1401 jxc[0] = p0_p2 * jfc;
1404 jxc[3] = -f_p2_x * p0_p2;
1405 jxc[4] = -f_p2_x * p0_p2 * y0;
1406 jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
1407 jxc[6] = -f_p2_x * y0;
1408 jxc[7] = ft_x_pn * p0_p2;
// y-row of the camera Jacobian.
1410 jyc[0] = p1_p2 * jfc;
1413 jyc[3] = -f_p2_y * p1_p2;
1414 jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
1415 jyc[5] = f_p2_y * x0 * p1_p2;
1416 jyc[6] = f_p2_y * x0;
1417 jyc[7] = ft_x_pn * p1_p2;
// Point Jacobian: chain rule through the rotation, distortion-corrected.
1423 jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
1424 jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
1425 jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
1426 jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
1427 jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
1428 jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
// Zero the padding lane when points are stored 4-wide.
1429 #ifdef POINT_DATA_ALIGN4
1430 jxp[3] = jyp[3] = 0;
// No (or measurement-side) distortion branch.
1435 #ifndef PBA_DISABLE_CONST_CAMERA
1436 if (c[15] != 0.0f) {
1456 jxc[0] = intrinsic_fixed ? 0 : p0_p2;
1459 jxc[3] = -f_p2 * p0_p2;
1460 jxc[4] = -f_p2 * p0_p2 * y0;
1461 jxc[5] = f_p2 * (z0 + x0 * p0_p2);
1462 jxc[6] = -f_p2 * y0;
1464 jyc[0] = intrinsic_fixed ? 0 : p1_p2;
1467 jyc[3] = -f_p2 * p1_p2;
1468 jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
1469 jyc[5] = f_p2 * x0 * p1_p2;
// Measurement-side distortion: derivative w.r.t. the coefficient depends on
// the measurement, not the projection.
1472 if (radial_distortion == -1 && !intrinsic_fixed) {
1473 Float msn = ms[0] * ms[0] + ms[1] * ms[1];
1474 jxc[7] = -ms[0] * msn;
1475 jyc[7] = -ms[1] * msn;
// Point Jacobian, undistorted.
1484 jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
1485 jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
1486 jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
1487 jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
1488 jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
1489 jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
1490 #ifdef POINT_DATA_ALIGN4
1491 jxp[3] = jyp[3] = 0;
1497 template <
class Float>
1499 const Float*
point, Float* jc, Float* jp,
const int* jmap,
1500 const Float* sj,
const Float* ms,
const int* cmlist,
1501 bool intrinsic_fixed,
int radial_distortion,
bool shuffle,
1502 Float* jct,
int mt = 2,
int i0 = 0);
1509 const Float *sj, *ms;
1511 bool intrinsic_fixed;
1512 int radial_distortion;
1518 q->sj, q->ms, q->cmlist, q->intrinsic_fixed,
1519 q->radial_distortion, q->shuffle, q->jct, 0, q->i0);
1522 template <class Float>
1524 const Float*
point, Float* jc, Float* jp, const
int* jmap,
1525 const Float* sj, const Float* ms, const
int* cmlist,
1526 bool intrinsic_fixed,
int radial_distortion,
bool shuffle,
1527 Float* jct,
int mt,
int i0) {
1528 if (mt > 1 && nproj >= mt) {
1531 for (
size_t i = 0; i < thread_num; ++i) {
1532 size_t first = nproj * i / thread_num;
1533 size_t last_ = nproj * (i + 1) / thread_num;
1534 size_t last =
std::min(last_, nproj);
1536 jmap + 2 * first, sj, ms + 2 * first, cmlist + first,
1537 intrinsic_fixed, radial_distortion, shuffle, jct, first);
1541 const Float* sjc0 = sj;
1542 const Float* sjp0 = sj ? sj + ncam * 8 :
NULL;
1544 for (
size_t i = i0; i < nproj; ++i, jmap += 2, ms += 2, ++cmlist) {
1545 int cidx = jmap[0], pidx = jmap[1];
1547 Float* jci = jc ? (jc + (shuffle ? cmlist[0] : i) * 16) :
NULL;
1552 intrinsic_fixed, radial_distortion);
1558 ScaleJ8(jci, jci + 8, sjc0 + cidx * 8);
1562 for (
int j = 0; j < 3; ++j) {
1569 if (jct && jc)
MemoryCopyB(jci, jci + 16, jct + cmlist[0] * 16);
1574 template <
class Float>
1576 const Float* sj =
NULL) {
1578 for (
size_t i = 0; i < ncam; ++i, qw += 2, d += 8, sj += 8) {
1579 if (qw[0] == 0)
continue;
1580 Float j1 = qw[0] * sj[0];
1581 Float j2 = qw[1] * sj[7];
1582 d[0] += (j1 * j1 * 2.0f);
1583 d[7] += (j2 * j2 * 2.0f);
1586 for (
size_t i = 0; i < ncam; ++i, qw += 2, d += 8) {
1587 if (qw[0] == 0)
continue;
1588 d[0] += (qw[0] * qw[0] * 2.0f);
1589 d[7] += (qw[1] * qw[1] * 2.0f);
1595 template <
class Float>
1598 const vector<int>& cmlistv,
const Float* qw0,
1599 avec<Float>& jtjdi,
bool jc_transpose,
int radial) {
1601 if (jcv.
size() == 0 || jpv.
size() == 0)
return;
1603 size_t ncam = cmapv.size() - 1, npts = pmapv.size() - 1;
1604 const int vn = radial ? 8 : 7;
1607 const int* cmap = &cmapv[0];
1608 const int* pmap = &pmapv[0];
1609 const int* cmlist = &cmlistv[0];
1610 const Float* jc = &jcv[0];
1611 const Float* jp = &jpv[0];
1612 const Float* qw = qw0;
1613 Float* jji = &jtjdi[0];
1616 for (
size_t i = 0; i < ncam; ++i, jji += 8, ++cmap, qw += 2) {
1617 int idx1 = cmap[0], idx2 = cmap[1];
1619 for (
int j = idx1; j < idx2; ++j) {
1620 int idx = jc_transpose ? j : cmlist[j];
1621 const Float* pj = jc + idx * 16;
1623 for (
int k = 0; k < vn; ++k)
1624 jji[k] += (pj[k] * pj[k] + pj[k + 8] * pj[k + 8]);
1626 if (qw0 && qw[0] > 0) {
1627 jji[0] += (qw[0] * qw[0] * 2.0f);
1628 jji[7] += (qw[1] * qw[1] * 2.0f);
1632 for (
size_t i = 0; i < npts; ++i, jji +=
POINT_ALIGN, ++pmap) {
1633 int idx1 = pmap[0], idx2 = pmap[1];
1635 for (
int j = idx1; j < idx2; ++j, pj +=
POINT_ALIGN2) {
1636 for (
int k = 0; k < 3; ++k)
1640 Float* it = jtjdi.
begin();
1641 for (; it < jtjdi.
end(); ++it) {
1642 *it = (*it == 0) ? 0 : Float(1.0 / (*it));
// InvertSymmetricMatrix<T, n, m>: in-place inversion of an n x n symmetric
// positive-definite matrix stored in rows of length m, via Cholesky
// factorization. NOTE(review): extraction fragment — the signature line and
// some loop bodies (e.g. the `sum` initialization) are missing; code kept
// verbatim.
1646 template <
class T,
int n,
int m>
// Phase 1: Cholesky decomposition A = L * L^T, L overwrites the lower
// triangle of a.
1648 for (
int i = 0; i < n; ++i) {
1650 a[i][i] = sqrt(a[i][i]);
1651 for (
int j = i + 1; j < n; ++j) a[j][i] = a[j][i] / a[i][i];
1652 for (
int j = i + 1; j < n; ++j)
1653 for (
int k = j; k < n; ++k) a[k][j] -= a[k][i] * a[j][i];
// Phase 2: invert the diagonal of L; a zero pivot is left as-is (treated as a
// degenerate/rank-deficient direction rather than dividing by zero).
1658 for (
int i = 0; i < n; ++i) {
1659 if (a[i][i] == 0)
continue;
1660 a[i][i] = 1.0f / a[i][i];
// Phase 3: invert the rest of the lower-triangular L in place.
1662 for (
int i = 1; i < n; ++i) {
1663 if (a[i][i] == 0)
continue;
1664 for (
int j = 0; j < i; ++j) {
1666 for (
int k = j; k < i; ++k) sum += (a[i][k] * a[k][j]);
1667 a[i][j] = -sum * a[i][i];
// Phase 4: A^-1 = L^-T * L^-1, written symmetrically into ai.
1672 for (
int i = 0; i < n; ++i) {
1673 for (
int j = i; j < n; ++j) {
1675 for (
int k = j; k < n; ++k) ai[i][j] += a[k][i] * a[k][j];
1676 ai[j][i] = ai[i][j];
// Flat-pointer convenience wrapper: reinterpret a/ai as n x m row arrays and
// forward to the 2-D overload above.
1680 template <
class T,
int n,
int m>
1682 InvertSymmetricMatrix<T, n, m>((T(*)[m])a, (T(*)[m])ai);
1685 template <
class Float>
1687 const Float* jc,
const int* cmap,
const int* cmlist,
1688 Float* di, Float* bi,
int vn,
bool jc_transpose,
1689 bool use_jq,
int mt);
1693 float lambda1, lambda2;
1695 const
int *cmap, *cmlist;
1698 bool jc_transpose, use_jq;
1701 q->cmlist, q->di, q->bi, q->vn, q->jc_transpose,
1705 template <class Float>
1707 const Float* jc, const
int* cmap, const
int* cmlist,
1708 Float* di, Float* bi,
int vn,
bool jc_transpose,
1709 bool use_jq,
int mt) {
1710 const size_t bc = vn * 8;
1712 if (mt > 1 && ncam >= (
size_t)mt) {
1715 for (
size_t i = 0; i < thread_num; ++i) {
1716 size_t first = ncam * i / thread_num;
1717 size_t last_ = ncam * (i + 1) / thread_num;
1718 size_t last =
std::min(last_, ncam);
1720 lambda2, jc, cmap + first, cmlist, di + 8 * first,
1721 bi + bc * first, vn, jc_transpose, use_jq);
1730 for (
size_t i = 0; i < ncam; ++i, ++cmap, bi += bc) {
1731 int idx1 = cmap[0], idx2 = cmap[1];
1738 for (
int j = idx1; j < idx2; ++j) {
1739 int idx = jc_transpose ? j : cmlist[j];
1740 const Float* pj = jc + idx * 16;
1750 for (
int j = 0; j < 8; ++j, ++di, pb += 9) {
1752 di[0] = temp = (di[0] + pb[0]);
1753 pb[0] = lambda2 * temp + lambda1;
1757 for (
int j = 0; j < 8; ++j, ++di, pb += 9) {
1758 *pb = lambda2 * ((*di) = (*pb)) + lambda1;
1764 InvertSymmetricMatrix<Float, 8, 8>(pbuf, bi);
1766 InvertSymmetricMatrix<Float, 7, 8>(pbuf, bi);
1772 template <
class Float>
1774 const Float* jp,
const int* pmap, Float* di,
1779 float lambda1, lambda2;
// Per-point diagonal block of J^T J: builds the damped 3x3 normal matrix for
// each point from its projection Jacobians and stores the explicit inverse
// (6 unique entries, symmetric) into bi for use as a block preconditioner.
// NOTE(review): extraction fragment — embedded line numbers, missing lines
// around the thread dispatch and the jxp/jyp setup; code kept verbatim.
1788 template <class Float>
1790 const Float* jp, const
int* pmap, Float* di,
1791 Float* bi,
int mt) {
// Multithreaded driver: partition points evenly across threads.
1795 for (
size_t i = 0; i < thread_num; ++i) {
1796 size_t first = npt * i / thread_num;
1797 size_t last_ = npt * (i + 1) / thread_num;
1798 size_t last =
std::min(last_, npt);
1800 lambda2, jp, pmap + first, di +
POINT_ALIGN * first,
// Scalar path: pmap gives each point's [first, last) projection range.
1805 for (
size_t i = 0; i < npt; ++i, ++pmap, di +=
POINT_ALIGN, bi += 6) {
1806 int idx1 = pmap[0], idx2 = pmap[1];
// Accumulate the symmetric 3x3 M = Jp^T Jp over this point's projections.
1808 Float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
1810 for (
int j = idx1; j < idx2;
1812 M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
1813 M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
1814 M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
1815 M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
1816 M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
1817 M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
// Levenberg-Marquardt damping on the diagonal.
1826 M00 = M00 * lambda2 + lambda1;
1827 M11 = M11 * lambda2 + lambda1;
1828 M22 = M22 * lambda2 + lambda1;
// Closed-form 3x3 inverse via cofactors; guard against overflow/underflow of
// the determinant by zeroing the block (preconditioner falls back to 0).
1831 Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 -
1832 M02 * M02 * M11 - M12 * M12 * M00;
1833 if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
1835 for (
int j = 0; j < 6; ++j) bi[j] = 0;
1837 bi[0] = (M11 * M22 - M12 * M12) / det;
1838 bi[1] = -(M01 * M22 - M12 * M02) / det;
1839 bi[2] = (M01 * M12 - M02 * M11) / det;
1840 bi[3] = (M00 * M22 - M02 * M02) / det;
1841 bi[4] = -(M00 * M12 - M01 * M02) / det;
1842 bi[5] = (M00 * M11 - M01 * M01) / det;
1848 template <
class Float>
1850 const Float* jc,
const int* cmap,
const Float* jp,
1851 const int* pmap,
const int* cmlist,
const Float* sj,
1852 const Float* wq, Float* diag, Float* blocks,
1853 int radial_distortion,
bool jc_transpose,
int mt1 = 2,
1854 int mt2 = 2,
int mode = 0) {
1855 const int vn = radial_distortion ? 8 : 7;
1856 const size_t bc = vn * 8;
1857 float lambda1 = dampd ? 0.0f : lambda;
1858 float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
1861 const size_t bsz = bc * ncam + npts * 6;
1863 bool use_jq = wq !=
NULL;
1871 blocks, vn, jc_transpose, use_jq, mt1);
1873 blocks + bc * ncam, mt2);
1874 }
else if (mode == 1) {
1875 const size_t bsz = bc * ncam;
1876 const size_t dsz = 8 * ncam;
1877 bool use_jq = wq !=
NULL;
1885 blocks, vn, jc_transpose, use_jq, mt1);
1887 blocks += bc * ncam;
1889 const size_t bsz = npts * 6;
1900 template <
class Float>
1906 int radial_distortion,
int mode = 0) {
1907 const int vn = radial_distortion ? 8 : 7;
1908 const size_t szbc = vn * 8;
1909 size_t ncam = camerav.
size() / 16;
1911 size_t sz_jcd = ncam * 8;
1912 size_t sz_jcb = ncam * szbc;
1917 float lambda1 = dampd ? 0.0f : lambda;
1918 float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
1920 Float jbufv[24 + 8];
1923 Float *jyc = jxc + 8, *jxp = jxc + 16, *jyp = jxc + 20;
1926 const int* jmap = &jmapv[0];
1927 const Float* camera = &camerav[0];
1928 const Float*
point = &pointv[0];
1929 const Float* ms = &meas[0];
1930 const Float* sjc0 = sjv.
size() ? &sjv[0] :
NULL;
1931 const Float* sjp0 = sjv.
size() ? &sjv[sz_jcd] :
NULL;
1933 Float *blockpc = &blockpv[0], *blockpp = &blockpv[sz_jcb];
1934 Float *bo = blockpc, *bi = &blocks[0], *di = &diag[0];
1938 for (
size_t i = 0; i < jmapv.size(); i += 2, jmap += 2, ms += 2) {
1939 int cidx = jmap[0], pidx = jmap[1];
1942 JacobianOne(c, pt, ms, jxc, jyc, jxp, jyp, intrinsic_fixed,
1948 const Float* sjc = sjc0 + cidx * 8;
1952 Float* bc = blockpc + cidx * szbc;
1969 Float* bp = blockpp + pidx * 6;
1970 bp[0] += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
1971 bp[1] += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
1972 bp[2] += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
1973 bp[3] += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
1974 bp[4] += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
1975 bp[5] += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
1982 const Float* qw = qwv.
begin();
1984 for (
size_t i = 0; i < ncam; ++i, qw += 2) {
1985 if (qw[0] == 0)
continue;
1986 Float* bc = blockpc + i * szbc;
1988 const Float* sjc = sjc0 + i * 8;
1989 Float j1 = sjc[0] * qw[0];
1990 Float j2 = sjc[7] * qw[1];
1991 bc[0] += (j1 * j1 * 2.0f);
1992 if (radial_distortion) bc[63] += (j2 * j2 * 2.0f);
1994 const Float* sjc = sjc0 + i * 8;
1995 bc[0] += (qw[0] * qw[0] * 2.0f);
1996 if (radial_distortion) bc[63] += (qw[1] * qw[1] * 2.0f);
2001 for (
size_t i = 0; i < ncam; ++i, bo += szbc, bi += szbc, di += 8) {
2002 Float *bp = bo, *dip = di;
2003 for (
int j = 0; j < vn; ++j, ++dip, bp += 9) {
2005 bp[0] = lambda2 * bp[0] + lambda1;
2009 if (radial_distortion)
2010 InvertSymmetricMatrix<Float, 8, 8>(bo, bi);
2012 InvertSymmetricMatrix<Float, 7, 8>(bo, bi);
2023 for (
size_t i = 0; i < npts; ++i, bo += 6, bi += 6, di +=
POINT_ALIGN) {
2024 Float &M00 = bo[0], &M01 = bo[1], &M02 = bo[2];
2025 Float &M11 = bo[3], &M12 = bo[4], &M22 = bo[5];
2031 M00 = M00 * lambda2 + lambda1;
2032 M11 = M11 * lambda2 + lambda1;
2033 M22 = M22 * lambda2 + lambda1;
2036 Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 -
2037 M02 * M02 * M11 - M12 * M12 * M00;
2038 if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
2039 for (
int j = 0; j < 6; ++j) bi[j] = 0;
2041 bi[0] = (M11 * M22 - M12 * M12) / det;
2042 bi[1] = -(M01 * M22 - M12 * M02) / det;
2043 bi[2] = (M01 * M12 - M02 * M11) / det;
2044 bi[3] = (M00 * M22 - M02 * M02) / det;
2045 bi[4] = -(M00 * M12 - M01 * M02) / det;
2046 bi[5] = (M00 * M11 - M01 * M01) / det;
2052 template <
class Float>
2054 Float* vx,
int vn,
int mt = 0);
2058 const Float *bi, *x;
2065 template <class Float>
2067 Float* vx,
int vn,
int mt) {
2068 if (mt > 1 && ncam >= mt) {
2069 const size_t bc = vn * 8;
2072 for (
int i = 0; i < thread_num; ++i) {
2073 int first = ncam * i / thread_num;
2074 int last_ = ncam * (i + 1) / thread_num;
2077 bi + first * bc, x + 8 * first, vx + 8 * first, vn);
2081 for (
int i = 0; i < ncam; ++i, x += 8, vx += 8) {
2083 for (
int j = 0; j < vn; ++j, bi += 8, ++vxc) *vxc =
DotProduct8(bi, x);
2088 template <
class Float>
2090 Float* vx,
int mt = 0);
2094 const Float *bi, *x;
// Applies the per-point 3x3 symmetric preconditioner blocks (6 packed entries
// each, as produced by the diagonal-block routine above) to vector x, writing
// vx. NOTE(review): extraction fragment — signature start and loop increments
// are missing; code kept verbatim.
2100 template <class Float>
2102 Float* vx,
int mt) {
// Multithreaded driver over the point range.
2103 if (mt > 1 && npoint >= mt) {
2106 for (
int i = 0; i < thread_num; ++i) {
2107 int first = npoint * i / thread_num;
2108 int last_ = npoint * (i + 1) / thread_num;
2109 int last =
std::min(last_, npoint);
// Scalar path: symmetric 3x3 matrix-vector product, packed layout
// [b0 b1 b2; b1 b3 b4; b2 b4 b5].
2116 for (
int i = 0; i < npoint;
2118 vx[0] = (bi[0] * x[0] + bi[1] * x[1] + bi[2] * x[2]);
2119 vx[1] = (bi[1] * x[0] + bi[3] * x[1] + bi[4] * x[2]);
2120 vx[2] = (bi[2] * x[0] + bi[4] * x[1] + bi[5] * x[2]);
2125 template <
class Float>
2127 const Float* vec, Float* resultv,
int radial,
2128 int mode,
int mt1,
int mt2) {
2129 const int vn = radial ? 8 : 7;
2135 resultv + 8 * ncam, mt2);
2138 template <
class Float>
2139 void ComputeJX(
size_t nproj,
size_t ncam,
const Float* x,
const Float* jc,
2140 const Float* jp,
const int* jmap, Float* jx,
int mode,
2145 const Float *xc, *jc, *jp;
2150 ComputeJX(q->nproj, q->ncam, q->xc, q->jc, q->jp, q->jmap, q->jx, q->mode, 0);
2153 template <class Float>
2154 void ComputeJX(
size_t nproj,
size_t ncam, const Float* x, const Float* jc,
2155 const Float* jp, const
int* jmap, Float* jx,
int mode,
int mt) {
2156 if (mt > 1 && nproj >= mt) {
2159 for (
size_t i = 0; i < thread_num; ++i) {
2160 size_t first = nproj * i / thread_num;
2161 size_t last_ = nproj * (i + 1) / thread_num;
2162 size_t last =
std::min(last_, nproj);
2164 jc + 16 * first, jp +
POINT_ALIGN2 * first, jmap + first * 2,
2165 jx + first * 2, mode);
2168 }
else if (mode == 0) {
2169 const Float *pxc = x, *pxp = pxc + ncam * 8;
2171 for (
size_t i = 0; i < nproj;
2172 ++i, jmap += 2, jc += 16, jp +=
POINT_ALIGN2, jx += 2) {
2175 }
else if (mode == 1) {
2176 const Float* pxc = x;
2178 for (
size_t i = 0; i < nproj;
2179 ++i, jmap += 2, jc += 16, jp +=
POINT_ALIGN2, jx += 2) {
2180 const Float* xc = pxc + jmap[0] * 8;
2184 }
else if (mode == 2) {
2185 const Float* pxp = x + ncam * 8;
2187 for (
size_t i = 0; i < nproj;
2188 ++i, jmap += 2, jc += 16, jp +=
POINT_ALIGN2, jx += 2) {
2190 jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
2191 jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
2196 template <
class Float>
2197 void ComputeJX_(
size_t nproj,
size_t ncam,
const Float* x, Float* jx,
2198 const Float* camera,
const Float*
point,
const Float* ms,
2199 const Float* sj,
const int* jmap,
bool intrinsic_fixed,
2200 int radial_distortion,
int mode,
int mt = 16);
2208 bool intrinsic_fixed;
2209 int radial_distortion;
2212 ComputeJX_(q->nproj, q->ncam, q->x, q->jx, q->camera, q->
point, q->ms, q->sj,
2213 q->jmap, q->intrinsic_fixed, q->radial_distortion, q->mode, 0);
2216 template <class Float>
2217 void ComputeJX_(
size_t nproj,
size_t ncam, const Float* x, Float* jx,
2218 const Float* camera, const Float*
point, const Float* ms,
2219 const Float* sj, const
int* jmap,
bool intrinsic_fixed,
2220 int radial_distortion,
int mode,
int mt) {
2221 if (mt > 1 && nproj >= mt) {
2224 for (
size_t i = 0; i < thread_num; ++i) {
2225 size_t first = nproj * i / thread_num;
2226 size_t last_ = nproj * (i + 1) / thread_num;
2227 size_t last =
std::min(last_, nproj);
2229 jx + first * 2, camera,
point, ms + 2 * first, sj,
2230 jmap + first * 2, intrinsic_fixed, radial_distortion, mode);
2233 }
else if (mode == 0) {
2236 Float *jc = (Float *)
ALIGN_PTR(jcv), *jp = jc + 16;
2238 const Float* sjc = sj;
2239 const Float* sjp = sjc ? (sjc + ncam * 8) :
NULL;
2240 const Float *xc0 = x, *xp0 = x + ncam * 8;
2243 for (
size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
2244 const int cidx = jmap[0], pidx = jmap[1];
2251 ScaleJ8(jc, jc + 8, sjc + cidx * 8);
2253 for (
int j = 0; j < 3; ++j) {
2261 }
else if (mode == 1) {
2267 const Float *sjc = sj, *xc0 = x;
2270 for (
size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
2271 const int cidx = jmap[0], pidx = jmap[1];
2275 intrinsic_fixed, radial_distortion);
2276 if (sjc)
ScaleJ8(jc, jc + 8, sjc + cidx * 8);
2277 const Float* xc = xc0 + cidx * 8;
2281 }
else if (mode == 2) {
2285 const Float* sjp = sj ? (sj + ncam * 8) :
NULL;
2286 const Float* xp0 = x + ncam * 8;
2289 for (
size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
2290 const int cidx = jmap[0], pidx = jmap[1];
2294 intrinsic_fixed, radial_distortion);
2299 jx[0] = (jp[0] * xp[0] * s[0] + jp[1] * xp[1] * s[1] +
2300 jp[2] * xp[2] * s[2]);
2301 jx[1] = (jp[3] * xp[0] * s[0] + jp[4] * xp[1] * s[1] +
2302 jp[5] * xp[2] * s[2]);
2304 jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
2305 jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
2311 template <
class Float>
2312 void ComputeJtEC(
size_t ncam,
const Float* pe,
const Float* jc,
const int* cmap,
2313 const int* cmlist, Float* v,
bool jc_transpose,
int mt);
2317 const Float *pe, *jc;
2318 const
int *cmap, *cmlist;
2322 ComputeJtEC(q->ncam, q->pe, q->jc, q->cmap, q->cmlist, q->v, q->jc_transpose,
2326 template <class Float>
2327 void ComputeJtEC(
size_t ncam, const Float* pe, const Float* jc, const
int* cmap,
2328 const
int* cmlist, Float* v,
bool jc_transpose,
int mt) {
2329 if (mt > 1 && ncam >= mt) {
2332 for (
size_t i = 0; i < thread_num; ++i) {
2333 size_t first = ncam * i / thread_num;
2334 size_t last_ = ncam * (i + 1) / thread_num;
2335 size_t last =
std::min(last_, ncam);
2337 cmlist, v + 8 * first, jc_transpose);
2342 for (
size_t i = 0; i < ncam; ++i, ++cmap, v += 8) {
2343 int idx1 = cmap[0], idx2 = cmap[1];
2344 for (
int j = idx1; j < idx2; ++j) {
2345 int edx = cmlist[j];
2346 const Float* pj = jc + ((jc_transpose ? j : edx) * 16);
2347 const Float* e = pe + edx * 2;
2356 template <
class Float>
2357 void ComputeJtEP(
size_t npt,
const Float* pe,
const Float* jp,
const int* pmap,
2362 const Float *pe, *jp;
2366 ComputeJtEP(q->npt, q->pe, q->jp, q->pmap, q->v, 0);
// ComputeJtEP: accumulates the point part of J^T e — for each point, sums its
// projections' Jacobian rows times the 2-D residuals. NOTE(review):
// extraction fragment — the tail that writes temp[] into v and the jp
// initialization are missing; code kept verbatim.
2369 template <class Float>
2370 void ComputeJtEP(
size_t npt, const Float* pe, const Float* jp, const
int* pmap,
// Multithreaded driver over the point range.
2372 if (mt > 1 && npt >= mt) {
2375 for (
size_t i = 0; i < thread_num; ++i) {
2376 size_t first = npt * i / thread_num;
2377 size_t last_ = npt * (i + 1) / thread_num;
2378 size_t last =
std::min(last_, npt);
// Scalar path: pmap gives each point's [first, last) projection range.
2384 for (
size_t i = 0; i < npt; ++i, ++pmap, v +=
POINT_ALIGN) {
2385 int idx1 = pmap[0], idx2 = pmap[1];
2387 const Float* e = pe + idx1 * 2;
2388 Float temp[3] = {0, 0, 0};
// pj holds the x-row Jacobian; the y-row lives POINT_ALIGN floats later.
2389 for (
int j = idx1; j < idx2; ++j, pj +=
POINT_ALIGN2, e += 2) {
2390 temp[0] += (e[0] * pj[0] + e[1] * pj[
POINT_ALIGN]);
2391 temp[1] += (e[0] * pj[1] + e[1] * pj[
POINT_ALIGN + 1]);
2392 temp[2] += (e[0] * pj[2] + e[1] * pj[
POINT_ALIGN + 2]);
2401 template <
class Float>
2402 void ComputeJtE(
size_t ncam,
size_t npt,
const Float* pe,
const Float* jc,
2403 const int* cmap,
const int* cmlist,
const Float* jp,
2404 const int* pmap, Float* v,
bool jc_transpose,
int mode,
int mt1,
2408 ComputeJtEC(ncam, pe, jc, cmap, cmlist, v, jc_transpose, mt1);
2411 ComputeJtEP(npt, pe, jp, pmap, v + 8 * ncam, mt2);
2415 template <
class Float>
2416 void ComputeJtEC_(
size_t ncam,
const Float* ee, Float* jte,
const Float* c,
2417 const Float*
point,
const Float* ms,
const int* jmap,
2418 const int* cmap,
const int* cmlist,
bool intrinsic_fixed,
2419 int radial_distortion,
int mt);
2426 const
int *jmap, *cmap, *cmlist;
2427 bool intrinsic_fixed;
2428 int radial_distortion;
2431 q->cmlist, q->intrinsic_fixed, q->radial_distortion, 0);
2434 template <class Float>
2435 void ComputeJtEC_(
size_t ncam, const Float* ee, Float* jte, const Float* c,
2436 const Float*
point, const Float* ms, const
int* jmap,
2437 const
int* cmap, const
int* cmlist,
bool intrinsic_fixed,
2438 int radial_distortion,
int mt) {
2439 if (mt > 1 && ncam >= mt) {
2443 for (
size_t i = 0; i < thread_num; ++i) {
2444 size_t first = ncam * i / thread_num;
2445 size_t last_ = ncam * (i + 1) / thread_num;
2446 size_t last =
std::min(last_, ncam);
2448 c + first * 16,
point, ms, jmap, cmap + first, cmlist,
2449 intrinsic_fixed, radial_distortion);
2457 Float *jcx = (Float *)
ALIGN_PTR(jcv), *jcy = jcx + 8;
2459 for (
size_t i = 0; i < ncam; ++i, ++cmap, jte += 8, c += 16) {
2460 int idx1 = cmap[0], idx2 = cmap[1];
2462 for (
int j = idx1; j < idx2; ++j) {
2463 int index = cmlist[j];
2465 const Float* e = ee + index * 2;
2468 intrinsic_fixed, radial_distortion);
2478 template <
class Float>
2479 void ComputeJtE_(
size_t nproj,
size_t ncam,
size_t npt,
const Float* ee,
2480 Float* jte,
const Float* camera,
const Float*
point,
2481 const Float* ms,
const int* jmap,
const int* cmap,
2482 const int* cmlist,
const int* pmap,
const Float* jp,
2483 bool intrinsic_fixed,
int radial_distortion,
int mode,
2488 intrinsic_fixed, radial_distortion, mt);
2491 ComputeJtEP(npt, ee, jp, pmap, jte + 8 * ncam, mt);
2495 template <
class Float>
2496 void ComputeJtE_(
size_t nproj,
size_t ncam,
size_t npt,
const Float* ee,
2497 Float* jte,
const Float* camera,
const Float*
point,
2498 const Float* ms,
const int* jmap,
bool intrinsic_fixed,
2499 int radial_distortion,
int mode) {
2503 Float *jc = (Float *)
ALIGN_PTR(jcv), *pj = jc + 16;
2505 Float *vc0 = jte, *vp0 = jte + ncam * 8;
2507 for (
size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, ee += 2) {
2508 int cidx = jmap[0], pidx = jmap[1];
2517 Float *vc = vc0 + cidx * 8, *vp = vp0 + pidx *
POINT_ALIGN;
2520 vp[0] += (ee[0] * pj[0] + ee[1] * pj[
POINT_ALIGN]);
2521 vp[1] += (ee[0] * pj[1] + ee[1] * pj[
POINT_ALIGN + 1]);
2522 vp[2] += (ee[0] * pj[2] + ee[1] * pj[
POINT_ALIGN + 2]);
2523 }
else if (mode == 1) {
2526 intrinsic_fixed, radial_distortion);
2529 Float* vc = vc0 + cidx * 8;
2535 intrinsic_fixed, radial_distortion);
2539 vp[0] += (ee[0] * pj[0] + ee[1] * pj[
POINT_ALIGN]);
2540 vp[1] += (ee[0] * pj[1] + ee[1] * pj[
POINT_ALIGN + 1]);
2541 vp[2] += (ee[0] * pj[2] + ee[1] * pj[
POINT_ALIGN + 2]);
2547 using namespace ProgramCPU;
2549 template <
class Float>
2561 _projection_sse(0) {
2563 if (num_threads <= 0) {
2571 #ifdef CPUPBA_USE_AVX
2613 template <
class Float>
2615 if (
sizeof(
CameraT) != 16 *
sizeof(
float))
return;
2616 _num_camera = (int)ncam;
2617 _camera_data = cams;
2621 template <
class Float>
2623 _focal_mask = fmask;
2627 template <
class Float>
2629 _num_point = (int)npoint;
2630 _point_data = (
float*)pts;
2633 template <
class Float>
2635 const int* point_idx,
2636 const int* cam_idx) {
2637 _num_imgpt = (int)nproj;
2638 _imgpt_data = (
float*)imgpts;
2639 _camera_idx = cam_idx;
2640 _point_idx = point_idx;
2643 template <
class Float>
2645 return float(_projection_sse /
2646 (_num_imgpt * __focal_scaling * __focal_scaling));
2649 template <
class Float>
2651 ResetBundleStatistics();
2653 if (__num_lm_success > 0)
2654 SaveBundleStatistics(_num_camera, _num_point, _num_imgpt);
2655 if (__num_lm_success > 0) PrintBundleStatistics();
2656 ResetTemporarySetting();
2657 return __num_lm_success;
2660 template <
class Float>
2662 if (_camera_data ==
NULL)
return STATUS_CAMERA_MISSING;
2663 if (_point_data ==
NULL)
return STATUS_POINT_MISSING;
2664 if (_imgpt_data ==
NULL)
return STATUS_MEASURMENT_MISSING;
2665 if (_camera_idx ==
NULL || _point_idx ==
NULL)
2666 return STATUS_PROJECTION_MISSING;
2667 return STATUS_SUCCESS;
2670 template <
class Float>
2674 InitializeStorageForSFM();
2675 InitializeStorageForCG();
2677 if (__debug_pba) DumpCooJacobian();
2679 return STATUS_SUCCESS;
2682 template <
class Float>
2684 return _num_camera * 8 +
POINT_ALIGN * _num_point;
2687 template <
class Float>
2689 if (ValidateInputData() != STATUS_SUCCESS)
return;
2695 if (InitializeBundle() != STATUS_SUCCESS) {
2697 }
else if (__profile_pba) {
2702 AdjustBundleAdjsutmentMode();
2703 NonlinearOptimizeLM();
2704 TransferDataToHost();
2709 template <
class Float>
2716 template <
class Float>
2719 std::copy(_cuCameraData.begin(), _cuCameraData.end(), ((
float*)_camera_data));
2720 #ifdef POINT_DATA_ALIGN4
2721 std::copy(_cuPointData.begin(), _cuPointData.end(), _point_data);
2723 for (
size_t i = 0, j = 0; i < _cuPointData.size(); j++) {
2724 _point_data[j++] = (float)_cuPointData[i++];
2725 _point_data[j++] = (float)_cuPointData[i++];
2726 _point_data[j++] = (float)_cuPointData[i++];
2731 #define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \
2733 NAME.resize((num) * (channels)); \
2734 total_sz += NAME.size() * sizeof(Float); \
2736 #define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \
2737 if (option) ALLOCATE_REQUIRED_DATA(NAME, num, channels) else { \
2741 template <
class Float>
2743 size_t total_sz = 0;
2745 ProcessIndexCameraQ(_cuCameraQMap, _cuCameraQList);
2746 total_sz += ((_cuCameraQMap.size() + _cuCameraQList.size()) *
sizeof(
int) /
2767 !__no_jacobian_store);
2769 !__no_jacobian_store && __jc_store_transpose);
2771 !__no_jacobian_store && __jc_store_original);
2773 __jc_store_transpose);
2776 BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
2778 vector<int>& cpi = _cuCameraMeasurementMap;
2779 cpi.resize(_num_camera + 1);
2780 vector<int>& cpidx = _cuCameraMeasurementList;
2781 cpidx.resize(_num_imgpt);
2782 vector<int> cpnum(_num_camera, 0);
2784 for (
int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++;
2785 for (
int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1];
2787 vector<int> cptidx = cpi;
2788 for (
int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i;
2791 if (_cuCameraMeasurementListT.size()) {
2792 vector<int>& ridx = _cuCameraMeasurementListT;
2793 ridx.resize(_num_imgpt);
2794 for (
int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i;
2799 if (_num_imgpt_q > 0)
2800 ProcessWeightCameraQ(cpnum, _cuCameraQMap, _cuCameraQMapW.begin(),
2801 _cuCameraQListW.begin());
2804 std::copy((
float*)_camera_data, ((
float*)_camera_data) + _cuCameraData.size(),
2805 _cuCameraData.begin());
2807 #ifdef POINT_DATA_ALIGN4
2808 std::copy(_point_data, _point_data + _cuPointData.size(),
2809 _cuPointData.begin());
2811 for (
size_t i = 0, j = 0; i < _cuPointData.size(); j++) {
2812 _cuPointData[i++] = _point_data[j++];
2813 _cuPointData[i++] = _point_data[j++];
2814 _cuPointData[i++] = _point_data[j++];
2820 vector<int>& ppi = _cuPointMeasurementMap;
2821 ppi.resize(_num_point + 1);
2822 for (
int i = 0, last_point = -1; i < _num_imgpt; ++i) {
2823 int pt = _point_idx[i];
2824 while (last_point < pt) ppi[++last_point] = i;
2826 ppi[_num_point] = _num_imgpt;
2829 vector<int>& pmp = _cuProjectionMap;
2830 pmp.resize(_num_imgpt * 2);
2831 for (
int i = 0; i < _num_imgpt; ++i) {
2832 int* imp = &pmp[i * 2];
2833 imp[0] = _camera_idx[i];
2834 imp[1] = _point_idx[i];
2836 BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
2839 __memory_usage = total_sz;
2840 if (__verbose_level > 1)
2841 std::cout <<
"Memory for Motion/Structure/Jacobian:\t"
2842 << (total_sz / 1024 / 1024) <<
"MB\n";
2847 template <
class Float>
2849 vector<int>& qlist) {
2855 if (_camera_idx ==
NULL)
return true;
2856 if (_point_idx ==
NULL)
return true;
2857 if (_focal_mask ==
NULL)
return true;
2858 if (_num_camera == 0)
return true;
2859 if (_weight_q <= 0)
return true;
2864 vector<int> temp(_num_camera * 2, -1);
2866 for (
int i = 0; i < _num_camera; ++i) {
2867 int iq = _focal_mask[i];
2872 if (iq < 0)
continue;
2873 if (iq == i)
continue;
2874 int ip = temp[2 * iq];
2881 if (_focal_mask[iq] != iq) {
2884 }
else if (ip == -1) {
2886 temp[2 * iq + 1] = i;
2888 temp[2 * i + 1] = iq;
2892 temp[2 * i + 1] = iq;
2893 temp[2 * ip + 1] = i;
2899 std::cout <<
"Error: incorrect constraints\n";
2905 qlist.resize(_num_camera * 2, -1);
2906 for (
int i = 0; i < _num_camera; ++i) {
2907 int inext = temp[2 * i + 1];
2908 if (inext == -1)
continue;
2909 qlist[2 * i] = _num_imgpt_q;
2910 qlist[2 * inext + 1] = _num_imgpt_q;
2912 qmap.push_back(inext);
2918 template <
class Float>
2921 Float* qmapw, Float* qlistw) {
2923 vector<Float> qpnum(_num_camera, 0), qcnum(_num_camera, 0);
2924 vector<Float> fs(_num_camera, 0), rs(_num_camera, 0);
2926 for (
int i = 0; i < _num_camera; ++i) {
2927 int qi = _focal_mask[i];
2928 if (qi == -1)
continue;
2931 fs[qi] += _camera_data[i].f;
2932 rs[qi] += _camera_data[i].radial;
2933 qpnum[qi] += cpnum[i];
2938 for (
int i = 0; i < _num_camera; ++i) {
2939 int qi = _focal_mask[i];
2940 if (qi == -1)
continue;
2943 _camera_data[i].f = fs[qi] / qcnum[qi];
2944 _camera_data[i].radial = rs[qi] / qcnum[qi];
2948 std::fill(qlistw, qlistw + _num_camera * 2, 0);
2950 for (
int i = 0; i < _num_imgpt_q; ++i) {
2951 int cidx = qmap[i * 2], qi = _focal_mask[cidx];
2952 Float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q;
2953 Float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0);
2955 qmapw[i * 2 + 1] = wr;
2956 qlistw[cidx * 2] = wi;
2957 qlistw[cidx * 2 + 1] = wr;
2962 template <
class Float>
2964 size_t total_sz = 0;
2965 int plen = GetParameterLength();
2976 unsigned int cblock_len = (__use_radial_distortion ? 64 : 56);
2983 __memory_usage += total_sz;
2984 if (__verbose_level > 1)
2985 std::cout <<
"Memory for Conjugate Gradient Solver:\t"
2986 << (total_sz / 1024 / 1024) <<
"MB\n";
2991 template <
class Float>
2993 if (!_cuVectorSJ.size())
return;
2995 if ((__jc_store_transpose || __jc_store_original) &&
2996 _cuJacobianPoint.size() && !__bundle_current_mode) {
2998 null.
swap(_cuVectorSJ);
2999 EvaluateJacobians();
3000 null.swap(_cuVectorSJ);
3005 null.
swap(_cuVectorSJ);
3006 EvaluateJacobians();
3007 ComputeBlockPC(0,
true);
3008 null.swap(_cuVectorSJ);
3009 _cuVectorJJ.swap(_cuVectorSJ);
3014 template <
class Float>
3016 if (__no_jacobian_store)
return;
3017 if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original &&
3018 !__jc_store_transpose)
3023 if (__jc_store_original || !__jc_store_transpose) {
3024 int fid = __jc_store_original
3025 ? (__jc_store_transpose ? FUNC_JJ_JCO_JCT_JP : FUNC_JJ_JCO_JP)
3028 _num_imgpt, _num_camera, _cuCameraData.begin(), _cuPointData.begin(),
3029 _cuJacobianCamera.begin(), _cuJacobianPoint.begin(),
3030 &_cuProjectionMap.front(), _cuVectorSJ.begin(), _cuMeasurements.begin(),
3031 __jc_store_transpose ? &_cuCameraMeasurementListT.front() :
NULL,
3032 __fixed_intrinsics, __use_radial_distortion,
false,
3033 _cuJacobianCameraT.begin(), __num_cpu_thread[fid]);
3036 _cuPointData.begin(), _cuJacobianCameraT.begin(),
3037 _cuJacobianPoint.begin(), &_cuProjectionMap.front(),
3038 _cuVectorSJ.begin(), _cuMeasurements.begin(),
3039 &_cuCameraMeasurementListT.front(), __fixed_intrinsics,
3040 __use_radial_distortion,
true, ((Float*)0),
3041 __num_cpu_thread[FUNC_JJ_JCT_JP]);
3043 ++__num_jacobian_eval;
3046 template <
class Float>
3049 if (mode == 0) mode = __bundle_current_mode;
3051 if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) {
3052 if (_cuJacobianPoint.size()) {
3054 _num_imgpt, _num_camera, _num_point, E.
begin(), JtE.
begin(),
3055 _cuCameraData.begin(), _cuPointData.begin(), _cuMeasurements.begin(),
3056 &_cuProjectionMap.front(), &_cuCameraMeasurementMap.front(),
3057 &_cuCameraMeasurementList.front(), &_cuPointMeasurementMap.front(),
3058 _cuJacobianPoint.begin(), __fixed_intrinsics, __use_radial_distortion,
3059 mode, __num_cpu_thread[FUNC_JTE_]);
3061 if (_cuVectorSJ.size() && mode != 2)
3065 JtE.
begin(), _cuCameraData.begin(),
3066 _cuPointData.begin(), _cuMeasurements.begin(),
3067 &_cuProjectionMap.front(), __fixed_intrinsics,
3068 __use_radial_distortion, mode);
3072 if (!_cuVectorSJ.size()) {
3073 }
else if (mode == 2)
3077 ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8);
3081 }
else if (__jc_store_transpose) {
3083 _num_camera, _num_point, E.
begin(), _cuJacobianCameraT.begin(),
3084 &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3085 _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.
begin(),
3086 true, mode, __num_cpu_thread[FUNC_JTEC_JCT],
3087 __num_cpu_thread[FUNC_JTEP]);
3090 _num_camera, _num_point, E.
begin(), _cuJacobianCamera.begin(),
3091 &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3092 _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.
begin(),
3093 false, mode, __num_cpu_thread[FUNC_JTEC_JCO],
3094 __num_cpu_thread[FUNC_JTEP]);
3097 if (mode != 2 && _num_imgpt_q > 0) {
3099 &_cuCameraQList.front(), _cuCameraQListW.begin(),
3100 _cuVectorSJ.begin(), JtE.
begin());
3104 template <
class Float>
3106 float damping,
float& g_norm,
3116 template <
class Float>
3119 ++__num_projection_eval;
3122 _cuMeasurements.begin(), &_cuProjectionMap.front(),
3123 proj.
begin(), __use_radial_distortion,
3124 __num_cpu_thread[FUNC_PJ]);
3125 if (_num_imgpt_q > 0)
3127 _cuCameraQMapW.begin(), proj.
begin() + 2 * _num_imgpt);
3131 template <
class Float>
3134 ++__num_projection_eval;
3137 _cuMeasurements.begin(), &_cuProjectionMap.front(),
3138 proj.
begin(), __use_radial_distortion,
3139 __num_cpu_thread[FUNC_PJ]);
3140 if (_num_imgpt_q > 0)
3142 _cuCameraQMapW.begin(), proj.
begin() + 2 * _num_imgpt);
3146 template <
class Float>
3149 if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) ||
3150 !__jc_store_original) {
3152 _num_imgpt, _num_camera,
X.begin(), JX.
begin(), _cuCameraData.begin(),
3153 _cuPointData.begin(), _cuMeasurements.begin(), _cuVectorSJ.begin(),
3154 &_cuProjectionMap.front(), __fixed_intrinsics, __use_radial_distortion,
3155 mode, __num_cpu_thread[FUNC_JX_]);
3158 _cuJacobianCamera.begin(), _cuJacobianPoint.begin(),
3159 &_cuProjectionMap.front(), JX.
begin(), mode,
3160 __num_cpu_thread[FUNC_JX]);
3163 if (_num_imgpt_q > 0 && mode != 2) {
3165 _cuCameraQMapW.begin(), _cuVectorSJ.begin(),
3166 JX.
begin() + 2 * _num_imgpt);
3170 template <
class Float>
3174 if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose &&
3175 __bundle_current_mode != 2)) {
3177 lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements,
3178 _cuProjectionMap, _cuVectorSJ, _cuCameraQListW, _cuVectorJJ, _cuBlockPC,
3179 __fixed_intrinsics, __use_radial_distortion, __bundle_current_mode);
3180 }
else if (__jc_store_transpose) {
3182 _num_camera, _num_point, lambda, dampd, _cuJacobianCameraT.begin(),
3183 &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(),
3184 &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3185 _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(),
3186 _cuBlockPC.begin(), __use_radial_distortion,
true,
3187 __num_cpu_thread[FUNC_BCC_JCT], __num_cpu_thread[FUNC_BCP],
3188 __bundle_current_mode);
3191 _num_camera, _num_point, lambda, dampd, _cuJacobianCamera.begin(),
3192 &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(),
3193 &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3194 _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(),
3195 _cuBlockPC.begin(), __use_radial_distortion,
false,
3196 __num_cpu_thread[FUNC_BCC_JCO], __num_cpu_thread[FUNC_BCP],
3197 __bundle_current_mode);
3201 template <
class Float>
3205 v.
begin(), pv.
begin(), __use_radial_distortion, mode,
3206 __num_cpu_thread[FUNC_MPC],
3207 __num_cpu_thread[FUNC_MPP]);
3210 template <
class Float>
3213 if (__no_jacobian_store) {
3214 }
else if (__jc_store_transpose) {
3216 _cuJacobianCameraT, _cuCameraMeasurementMap, _cuJacobianPoint,
3217 _cuPointMeasurementMap, _cuCameraMeasurementList,
3218 _cuCameraQListW.begin(), JJ,
true, __use_radial_distortion);
3219 }
else if (__jc_store_original) {
3221 _cuJacobianCamera, _cuCameraMeasurementMap, _cuJacobianPoint,
3222 _cuPointMeasurementMap, _cuCameraMeasurementList,
3223 _cuCameraQListW.begin(), JJ,
false, __use_radial_distortion);
3227 template <
class Float>
3229 int incompatible_radial_distortion = 0;
3230 _cuMeasurements.resize(_num_imgpt * 2);
3231 if (__focal_normalize) {
3232 if (__focal_scaling == 1.0f) {
3235 vector<float> focals(_num_camera);
3236 for (
int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f;
3237 std::nth_element(focals.begin(), focals.begin() + _num_camera / 2,
3239 float median_focal_length = focals[_num_camera / 2];
3240 __focal_scaling = __data_normalize_median / median_focal_length;
3241 Float radial_factor = median_focal_length * median_focal_length * 4.0f;
3245 for (
int i = 0; i < _num_imgpt * 2; ++i) {
3246 _cuMeasurements[i] = Float(_imgpt_data[i] * __focal_scaling);
3248 for (
int i = 0; i < _num_camera; ++i) {
3249 _camera_data[i].f *= __focal_scaling;
3250 if (!__use_radial_distortion) {
3251 }
else if (__reset_initial_distortion) {
3252 _camera_data[i].radial = 0;
3253 }
else if (_camera_data[i].distortion_type != __use_radial_distortion) {
3254 incompatible_radial_distortion++;
3255 _camera_data[i].radial = 0;
3256 }
else if (__use_radial_distortion == -1) {
3257 _camera_data[i].radial *= radial_factor;
3260 if (__verbose_level > 2)
3261 std::cout <<
"Focal length normalized by " << __focal_scaling <<
'\n';
3262 __reset_initial_distortion =
false;
3265 if (__use_radial_distortion) {
3266 for (
int i = 0; i < _num_camera; ++i) {
3267 if (__reset_initial_distortion) {
3268 _camera_data[i].radial = 0;
3269 }
else if (_camera_data[i].distortion_type != __use_radial_distortion) {
3270 _camera_data[i].radial = 0;
3271 incompatible_radial_distortion++;
3274 __reset_initial_distortion =
false;
3276 std::copy(_imgpt_data, _imgpt_data + _cuMeasurements.size(),
3277 _cuMeasurements.begin());
3280 if (incompatible_radial_distortion) {
3281 std::cout <<
"WARNING: incompatible radial distortion input; reset to 0;\n";
3285 template <
class Float>
3287 if (__depth_scaling == 1.0f) {
3288 const float dist_bound = 1.0f;
3289 vector<float> oz(_num_imgpt);
3290 vector<float> cpdist1(_num_camera, dist_bound);
3291 vector<float> cpdist2(_num_camera, -dist_bound);
3292 vector<int> camnpj(_num_camera, 0), cambpj(_num_camera, 0);
3293 int bad_point_count = 0;
3294 for (
int i = 0; i < _num_imgpt; ++i) {
3295 int cmidx = _camera_idx[i];
3296 CameraT* cam = _camera_data + cmidx;
3297 float* rz = cam->
m[2];
3298 float* x = _point_data + 4 * _point_idx[i];
3299 oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->
t[2]);
3303 float ozr = oz[i] / cam->
t[2];
3304 if (
fabs(ozr) < __depth_check_epsilon) {
3306 float px = cam->
f * (cam->
m[0][0] * x[0] + cam->
m[0][1] * x[1] +
3307 cam->
m[0][2] * x[2] + cam->
t[0]);
3308 float py = cam->
f * (cam->
m[1][0] * x[0] + cam->
m[1][1] * x[1] +
3309 cam->
m[1][2] * x[2] + cam->
t[1]);
3310 float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1];
3312 if ((checkx && px * oz[i] * mx < 0 &&
fabs(mx) > 64) ||
3313 (!checkx &&
py * oz[i] * my < 0 &&
fabs(my) > 64)) {
3314 if (__verbose_level > 3)
3315 std::cout <<
"Warning: proj of #" << cmidx
3316 <<
" on the wrong side, oz = " << oz[i] <<
" ("
3317 << (px / oz[i]) <<
',' << (
py / oz[i]) <<
") (" << mx
3318 <<
',' << my <<
")\n";
3326 cpdist1[cmidx] =
std::min(cpdist1[cmidx], oz[i]);
3328 cpdist2[cmidx] =
std::max(cpdist2[cmidx], oz[i]);
3331 __num_point_behind++;
3336 if (bad_point_count > 0 && __depth_degeneracy_fix) {
3337 if (!__focal_normalize || !__depth_normalize)
3338 std::cout <<
"Enable data normalization on degeneracy\n";
3339 __focal_normalize =
true;
3340 __depth_normalize =
true;
3342 if (__depth_normalize) {
3343 std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end());
3344 float oz_median = oz[_num_imgpt / 2];
3345 float shift_min =
std::min(oz_median * 0.001f, 1.0f);
3346 float dist_threshold = shift_min * 0.1f;
3347 __depth_scaling = (1.0 / oz_median) / __data_normalize_median;
3348 if (__verbose_level > 2)
3349 std::cout <<
"Depth normalized by " << __depth_scaling <<
" ("
3350 << oz_median <<
")\n";
3352 for (
int i = 0; i < _num_camera; ++i) {
3354 if (!__depth_degeneracy_fix) {
3355 }
else if ((cpdist1[i] < dist_threshold ||
3356 cpdist2[i] > -dist_threshold)) {
3357 float shift_epsilon =
fabs(_camera_data[i].t[2] * FLT_EPSILON);
3358 float shift =
std::max(shift_min, shift_epsilon);
3360 cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold;
3361 _camera_data[i].t[2] += shift;
3362 if (__verbose_level > 3)
3363 std::cout <<
"Adjust C" << std::setw(5) << i <<
" by "
3364 << std::setw(12) << shift <<
" [B" << std::setw(2)
3365 << cambpj[i] <<
"/" << std::setw(5) << camnpj[i] <<
"] ["
3366 << (boths ?
'X' :
' ') <<
"][" << cpdist1[i] <<
", "
3367 << cpdist2[i] <<
"]\n";
3368 __num_camera_modified++;
3370 _camera_data[i].t[0] *= __depth_scaling;
3371 _camera_data[i].t[1] *= __depth_scaling;
3372 _camera_data[i].t[2] *= __depth_scaling;
3374 for (
int i = 0; i < _num_point; ++i) {
3376 _point_data[4 * i + 0] *= __depth_scaling;
3377 _point_data[4 * i + 1] *= __depth_scaling;
3378 _point_data[4 * i + 2] *= __depth_scaling;
3381 if (__num_point_behind > 0)
3382 std::cout <<
"WARNING: " << __num_point_behind
3383 <<
" points are behind cameras.\n";
3384 if (__num_camera_modified > 0)
3385 std::cout <<
"WARNING: " << __num_camera_modified
3386 <<
" camera moved to avoid degeneracy.\n";
3390 template <
class Float>
3392 if (__focal_normalize && __focal_scaling != 1.0f) {
3393 float squared_focal_factor = (__focal_scaling * __focal_scaling);
3394 for (
int i = 0; i < _num_camera; ++i) {
3395 _camera_data[i].f /= __focal_scaling;
3396 if (__use_radial_distortion == -1)
3397 _camera_data[i].radial *= squared_focal_factor;
3398 _camera_data[i].distortion_type = __use_radial_distortion;
3400 _projection_sse /= squared_focal_factor;
3401 __focal_scaling = 1.0f;
3402 }
else if (__use_radial_distortion) {
3403 for (
int i = 0; i < _num_camera; ++i)
3404 _camera_data[i].distortion_type = __use_radial_distortion;
3407 if (__depth_normalize && __depth_scaling != 1.0f) {
3408 for (
int i = 0; i < _num_camera; ++i) {
3409 _camera_data[i].t[0] /= __depth_scaling;
3410 _camera_data[i].t[1] /= __depth_scaling;
3411 _camera_data[i].t[2] /= __depth_scaling;
3413 for (
int i = 0; i < _num_point; ++i) {
3414 _point_data[4 * i + 0] /= __depth_scaling;
3415 _point_data[4 * i + 1] /= __depth_scaling;
3416 _point_data[4 * i + 2] /= __depth_scaling;
3418 __depth_scaling = 1.0f;
3422 template <
class Float>
3428 __recent_cg_status =
' ';
3431 int plen = GetParameterLength();
3433 VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ :
null;
3434 ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3446 r.
set(_cuVectorRK.data(), 8 * _num_camera);
3448 p.
set(_cuVectorPK.data(), 8 * _num_camera);
3450 z.
set(_cuVectorZK.data(), 8 * _num_camera);
3452 x.
set(_cuVectorXK.data(), 8 * _num_camera);
3454 d.
set(VectorDP.
data(), 8 * _num_camera);
3459 up.
set(u.
data() + 8 * _num_camera, 3 * _num_point);
3461 vp.
set(v.
data() + 8 * _num_camera, 3 * _num_point);
3463 uc.
set(z.
data(), 8 * _num_camera);
3468 ApplyBlockPC(_cuVectorJtE, u, 2);
3472 ApplyBlockPC(r, p, 1);
3477 ApplyBlockPC(u, v, 2);
3482 float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0);
3484 if (__verbose_cg_iteration)
3485 std::cout <<
" --0,\t alpha = " << alpha0
3486 <<
", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) <<
"\n";
3487 if (!std::isfinite(alpha0)) {
3491 __recent_cg_status =
'I';
3498 ComputeSAXPY(Float(-1.0f), e2, e, e, __num_cpu_thread[FUNC_VV]);
3504 float_t rtzk = rtz0, rtz_min = rtz0, betak;
3506 ++__num_cg_iteration;
3509 ApplyBlockPC(r, z, 1);
3516 if (rtz_ratio < __cg_norm_threshold) {
3517 if (__recent_cg_status ==
' ')
3518 __recent_cg_status = iteration <
std::min(10, __cg_min_iteration)
3521 if (iteration >= __cg_min_iteration)
break;
3524 betak = rtzk / rtzp;
3530 ApplyBlockPC(u, v, 2);
3537 float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk);
3540 if (__verbose_cg_iteration)
3541 std::cout <<
" --" << iteration <<
",\t alpha= " << alphak
3542 <<
", rtzk/rtz0 = " << rtz_ratio
3543 <<
", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) <<
"\n";
3546 if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
3547 __recent_cg_status =
'X';
3556 ++__num_cg_iteration;
3557 if (iteration >=
std::min(__cg_max_iteration, plen))
break;
3560 ComputeSAXPY((Float)-1.0f, e2, e, e, __num_cpu_thread[FUNC_VV]);
3569 jte_p.
set(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 3);
3571 ApplyBlockPC(v, _cuVectorXK, 2);
3575 template <
class Float>
3581 __recent_cg_status =
' ';
3584 int plen = GetParameterLength();
3586 VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ :
null;
3587 VectorF& VectorQK = _cuVectorZK;
3588 ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3591 ApplyBlockPC(_cuVectorJtE,
3599 _cuVectorJX, __num_cpu_thread[FUNC_VS]);
3602 float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0);
3604 if (__verbose_cg_iteration)
3605 std::cout <<
" --0,\t alpha = " << alpha0
3606 <<
", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) <<
"\n";
3607 if (!std::isfinite(alpha0)) {
3611 __recent_cg_status =
'I';
3621 ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK,
3625 (Float)-alpha0, VectorQK, _cuVectorJtE,
3628 float_t rtzk = rtz0, rtz_min = rtz0, betak;
3630 ++__num_cg_iteration;
3633 ApplyBlockPC(_cuVectorRK, _cuVectorZK);
3638 _cuVectorRK, _cuVectorZK);
3640 if (rtz_ratio < __cg_norm_threshold) {
3641 if (__recent_cg_status ==
' ')
3642 __recent_cg_status = iteration <
std::min(10, __cg_min_iteration)
3645 if (iteration >= __cg_min_iteration)
break;
3648 betak = rtzk / rtzp;
3657 _cuVectorJX, __num_cpu_thread[FUNC_VS]);
3659 _cuVectorPK, VectorDP);
3661 float_t alphak = rtzk / (qtqk + lambda * ptdpk);
3664 if (__verbose_cg_iteration)
3665 std::cout <<
" --" << iteration <<
",\t alpha= " << alphak
3666 <<
", rtzk/rtz0 = " << rtz_ratio
3667 <<
", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) <<
"\n";
3670 if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
3671 __recent_cg_status =
'X';
3681 ++__num_cg_iteration;
3682 if (iteration >=
std::min(__cg_max_iteration, plen))
break;
3684 if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) {
3688 ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK);
3689 ComputeSAXPY((Float)-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK);
3692 ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK,
3695 (Float)-alphak, VectorQK, _cuVectorRK,
3702 template <
class Float>
3704 if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3705 ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3706 ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1);
3708 }
else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3709 ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3710 ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2);
3714 return __cg_schur_complement ? SolveNormalEquationPCGX(lambda)
3715 : SolveNormalEquationPCGB(lambda);
3719 template <
class Float>
3722 ofstream jo(
"j.txt");
3723 int cn = __use_radial_distortion ? 8 : 7;
3724 int width = cn * _num_camera + 3 * _num_point;
3725 jo <<
"%%MatrixMarket matrix coordinate real general\n";
3726 jo << (_num_imgpt * 2) <<
" " <<
width <<
" " << (cn + 3) * _num_imgpt * 2
3728 for (
int i = 0; i < _num_imgpt; ++i) {
3729 int ci = _camera_idx[i];
3730 int pi = _point_idx[i];
3731 int row = i * 2 + 1;
3735 int idx2 = _num_camera * cn + 3 * pi;
3737 for (
int k = 0; k < 2; ++k, ++row) {
3738 for (
int j = 0; j < cn; ++j) {
3739 jo << row <<
" " << (idx1 + j + 1) <<
" 1\n";
3741 for (
int j = 0; j < 3; ++j) {
3742 jo << row <<
" " << (idx2 + j + 1) <<
" 1\n";
3747 ofstream jt(
"jt.txt");
3748 jt <<
"%%MatrixMarket matrix coordinate real general\n";
3749 jt <<
width <<
" " << (_num_imgpt * 2) <<
" " << (cn + 3) * _num_imgpt * 2
3752 int* lisc = &_cuCameraMeasurementList[0];
3753 int* mapc = &_cuCameraMeasurementMap[0];
3754 int* mapp = &_cuPointMeasurementMap[0];
3756 for (
int i = 0; i < _num_camera; ++i) {
3758 int c1 = mapc[i + 1];
3759 for (
int k = 0; k < cn; ++k) {
3760 int row = i * cn + k + 1;
3761 for (
int j = c0; j < c1; ++j)
3762 jt << row <<
" " << (lisc[j] * 2 + 1) <<
" 1\n" << row <<
" "
3763 << (2 * lisc[j] + 2) <<
" 1\n";
3767 for (
int i = 0; i < _num_point; ++i) {
3769 int p1 = mapp[i + 1];
3770 for (
int k = 0; k < 3; ++k) {
3771 int row = i * 3 + _num_camera * cn + k + 1;
3772 for (
int j = p0; j < p1; ++j)
3773 jt << row <<
" " << (2 * j + 1) <<
" 1\n" << row <<
" " << (2 * j + 2)
3780 template <
class Float>
3782 EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
3783 EvaluateJacobians();
3786 SolveNormalEquationPCGX(__lm_initial_damp);
3788 SolveNormalEquationPCGB(__lm_initial_damp);
3795 template <
class Float>
3800 if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3801 if (__jacobian_normalize)
3802 ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera);
3804 _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3805 _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
3806 return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj);
3807 }
else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3808 if (__jacobian_normalize)
3812 _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3813 _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
3814 return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj);
3816 if (__jacobian_normalize)
ComputeVXY(_cuVectorXK, _cuVectorSJ, dx);
3818 _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3819 _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
3820 return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj);
3824 template <
class Float>
3828 float expected_reduction;
3829 if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3831 xk.
set(_cuVectorXK.data(), 8 * _num_camera);
3833 jte.
set(_cuVectorJtE.data(), 8 * _num_camera);
3835 if (__lm_use_diagonal_damp) {
3837 jj.
set(_cuVectorJJ.data(), 8 * _num_camera);
3839 expected_reduction = damping * dq + dxtg;
3841 expected_reduction = damping * dx_sqnorm + dxtg;
3843 _cuCameraData.swap(_cuCameraDataEX);
3844 }
else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3846 xk.
set(_cuVectorXK.data() + 8 * _num_camera,
POINT_ALIGN * _num_point);
3848 jte.
set(_cuVectorJtE.data() + 8 * _num_camera,
POINT_ALIGN * _num_point);
3850 if (__lm_use_diagonal_damp) {
3852 jj.
set(_cuVectorJJ.data() + 8 * _num_camera,
POINT_ALIGN * _num_point);
3854 expected_reduction = damping * dq + dxtg;
3856 expected_reduction = damping * dx_sqnorm + dxtg;
3858 _cuPointData.swap(_cuPointDataEX);
3861 if (__accurate_gain_ratio) {
3865 expected_reduction = 2.0f * dxtg - njx;
3868 if (expected_reduction <= 0)
3869 expected_reduction = 0.001f * residual_reduction;
3870 }
else if (__lm_use_diagonal_damp) {
3872 expected_reduction = damping * dq + dxtg;
3874 expected_reduction = damping * dx_sqnorm + dxtg;
3877 _cuCameraData.swap(_cuCameraDataEX);
3878 _cuPointData.swap(_cuPointDataEX);
3881 return float(residual_reduction / expected_reduction);
3884 template <
class Float>
3886 if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3887 _cuJacobianPoint.resize(0);
3888 }
else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3889 _cuJacobianCamera.resize(0);
3890 _cuJacobianCameraT.resize(0);
3894 template <
class Float>
3896 if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3898 temp.
set(_cuVectorXK.data(), 8 * _num_camera);
3900 }
else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3902 temp.
set(_cuVectorXK.data() + 8 * _num_camera,
POINT_ALIGN * _num_point);
3909 template <
class Float>
3915 float mse_convert_ratio =
3916 1.0f / (_num_imgpt * __focal_scaling * __focal_scaling);
3917 float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f;
3918 const int edwidth = __verbose_sse ? 12 : 8;
3920 EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
3921 __initial_mse = __final_mse = _projection_sse * mse_convert_ratio;
3924 if (__jacobian_normalize) PrepareJacobianNormalization();
3927 EvaluateJacobians();
3930 if (__verbose_level)
3931 std::cout <<
"Initial " << (__verbose_sse ?
"sumed" :
"mean")
3932 <<
" squared error = " << __initial_mse * error_display_ratio
3933 <<
"\n----------------------------------------------\n";
3936 VectorF& cuImageTempProj = _cuVectorJX;
3938 VectorF& cuVectorDX = _cuVectorSJ.
size() ? _cuVectorZK : _cuVectorXK;
3941 float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf;
3942 SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm,
3946 std::cout << std::left;
3947 for (
int i = 0; i < __lm_max_iteration && !__abort_flag;
3948 __current_iteration = (++i)) {
3950 int num_cg_iteration = SolveNormalEquation(damping);
3953 if (num_cg_iteration == 0) {
3954 if (__verbose_level)
3955 std::cout <<
"#" << std::setw(3) << i <<
" quit on numeric errors\n";
3956 __pba_return_code =
'E';
3961 if (__recent_cg_status ==
'I') {
3962 std::cout <<
"#" << std::setw(3) << i <<
" 0 I e=" << std::setw(edwidth)
3964 <<
" u=" << std::setprecision(3) << std::setw(9) << damping
3965 <<
'\n' << std::setprecision(6);
3967 damping = damping * damping_adjust;
3968 damping_adjust = 2.0f * damping_adjust;
3974 ++__num_lm_iteration;
3977 float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm);
3981 if (dx_norm <= __lm_delta_threshold) {
3983 if (__verbose_level > 1)
3984 std::cout <<
"#" << std::setw(3) << i <<
" " << std::setw(3)
3985 << num_cg_iteration << char(__recent_cg_status)
3986 <<
" quit on too small change (" << dx_norm <<
" < "
3987 << __lm_delta_threshold <<
")\n";
3988 __pba_return_code =
'S';
3994 float average_residual = new_residual * mse_convert_ratio;
3995 float residual_reduction = _projection_sse - new_residual;
3998 if (std::isfinite(new_residual) && residual_reduction > 0) {
4000 float relative_reduction = 1.0f - (new_residual / _projection_sse);
4004 _projection_sse = new_residual;
4005 _cuImageProj.swap(cuImageTempProj);
4009 SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping);
4012 SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping,
4016 if (__verbose_level > 1)
4017 std::cout <<
"#" << std::setw(3) << i <<
" " << std::setw(3)
4018 << num_cg_iteration << char(__recent_cg_status)
4019 <<
" e=" << std::setw(edwidth)
4020 << average_residual * error_display_ratio
4021 <<
" u=" << std::setprecision(3) << std::setw(9) << damping
4022 <<
" r=" << std::setw(6)
4023 <<
floor(gain_ratio * 1000.f) * 0.001f
4024 <<
" g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm <<
" "
4025 << std::setw(9) << relative_reduction <<
' ' << std::setw(9)
4026 << dx_norm <<
" t=" << int(BundleTimerGetNow()) <<
"\n"
4027 << std::setprecision(6);
4030 if (!IsTimeBudgetAvailable()) {
4031 if (__verbose_level > 1)
4032 std::cout <<
"#" << std::setw(3) << i <<
" used up time budget.\n";
4033 __pba_return_code =
'T';
4035 }
else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) {
4036 if (__verbose_level > 1)
4037 std::cout <<
"#" << std::setw(3) << i
4038 <<
" converged with small gradient\n";
4039 __pba_return_code =
'G';
4041 }
else if (average_residual * error_display_ratio <= __lm_mse_threshold) {
4042 if (__verbose_level > 1)
4043 std::cout <<
"#" << std::setw(3) << i <<
" satisfies MSE threshold\n";
4044 __pba_return_code =
'M';
4048 float temp = gain_ratio * 2.0f - 1.0f;
4049 float adaptive_adjust = 1.0f - temp * temp * temp;
4050 float auto_adjust =
std::max(1.0f / 3.0f, adaptive_adjust);
4053 damping = damping * auto_adjust;
4054 damping_adjust = 2.0f;
4055 if (damping < __lm_minimum_damp)
4056 damping = __lm_minimum_damp;
4057 else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp &&
4058 __lm_use_diagonal_damp)
4059 damping = __lm_maximum_damp;
4061 EvaluateJacobians();
4065 if (__verbose_level > 1)
4066 std::cout <<
"#" << std::setw(3) << i <<
" " << std::setw(3)
4067 << num_cg_iteration << char(__recent_cg_status)
4068 <<
" e=" << std::setw(edwidth) << std::left
4069 << average_residual * error_display_ratio
4070 <<
" u=" << std::setprecision(3) << std::setw(9) << damping
4071 <<
" r=----- " << (__lm_check_gradient || __save_gradient_norm
4074 <<
" --------- " << std::setw(9) << dx_norm
4075 <<
" t=" << int(BundleTimerGetNow()) <<
"\n"
4076 << std::setprecision(6);
4078 if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp &&
4079 damping > __lm_damping_auto_switch) {
4080 __lm_use_diagonal_damp =
false;
4081 damping = __lm_damping_auto_switch;
4082 damping_adjust = 2.0f;
4083 if (__verbose_level > 1)
4084 std::cout <<
"NOTE: switch to damping with an identity matix\n";
4087 damping = damping * damping_adjust;
4088 damping_adjust = 2.0f * damping_adjust;
4092 if (__verbose_level == 1) std::cout <<
'.';
4095 __final_mse = float(_projection_sse * mse_convert_ratio);
4097 __use_radial_distortion
4098 ? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) *
4103 #define PROFILE_REPORT2(A, T) \
4104 std::cout << std::setw(24) << A << ": " << (T) << "\n";
4106 #define PROFILE_REPORT(A) \
4107 std::cout << std::setw(24) << A << ": " \
4108 << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n";
4110 #define PROFILE_(B) \
4111 BundleTimerStart(TIMER_PROFILE_STEP); \
4112 for (int i = 0; i < repeat; ++i) { \
4115 BundleTimerSwitch(TIMER_PROFILE_STEP);
4117 #define PROFILE(A, B) PROFILE_(A B) PROFILE_REPORT(#A)
4118 #define PROXILE(A, B) PROFILE_(B) PROFILE_REPORT(A)
4119 #define PROTILE(FID, A, B) \
4121 float tbest = FLT_MAX; \
4123 int nto = nthread[FID]; \
4125 std::ostringstream os1; \
4126 os1 << #A "(" << nto << ")"; \
4127 PROXILE(os1.str(), A B); \
4129 for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \
4132 float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4134 if (j >= max(nto, 16)) break; \
4140 if (nto != 0) nthread[FID] = nbest; \
4142 std::ostringstream os; \
4143 os << #A "(" << nbest << ")"; \
4144 PROFILE_REPORT2(os.str(), tbest); \
4148 #define PROTILE2(FID1, FID2, A, B) \
4150 int nt1 = nthread[FID1], nt2 = nthread[FID2]; \
4152 std::ostringstream os1; \
4153 os1 << #A "(" << nt1 << "," << nt2 << ")"; \
4154 PROXILE(os1.str(), A B); \
4156 float tbest = FLT_MAX; \
4157 int nbest1 = 1, nbest2 = 1; \
4158 nthread[FID2] = 1; \
4159 for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \
4160 nthread[FID1] = j; \
4162 float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4164 if (j >= max(nt1, 16)) break; \
4170 nthread[FID1] = nbest1; \
4171 for (int j = 2; j <= THREAD_NUM_MAX; j *= 2) { \
4172 nthread[FID2] = j; \
4174 float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4176 if (j >= max(nt2, 16)) break; \
4182 nthread[FID2] = nbest2; \
4184 std::ostringstream os; \
4185 os << #A "(" << nbest1 << "," << nbest2 << ")"; \
4186 PROFILE_REPORT2(os.str(), tbest); \
4188 if (nt1 == 0) nthread[FID1] = 0; \
4189 if (nt2 == 0) nthread[FID2] = 0; \
// NOTE(review): doxygen-extracted listing of the profiling driver of this
// bundle-adjustment solver (RunProfileSteps in the original source).  The
// "41xx" tokens on many lines are source-line numbers fused into the text,
// and a number of original lines -- including the function signature and the
// closing brace -- are missing.  Code kept byte-identical; restore it from
// the original file before compiling.
4192 template <
class Float>
// Each measurement is averaged over `repeat` runs (at least 1).
4194 const int repeat =
std::max(__profile_pba, 1);
// Per-function thread-count table; updated in place by the PROTILE searches.
4195 int* nthread = __num_cpu_thread;
4196 std::cout <<
"---------------------------------\n"
4197 "| Run profiling steps ("
4198 << repeat <<
") |\n"
4199 "---------------------------------\n"
// Warm-up: evaluate the projections, the (optional) jacobian normalization,
// the jacobians and the block preconditioner once before timing anything.
4204 EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
4205 if (__jacobian_normalize) PrepareJacobianNormalization();
4206 EvaluateJacobians();
4208 ComputeBlockPC(__lm_initial_damp,
true);
// Double the initial damping (capped at 1024) while both PCG solvers return
// 10 -- presumably "hit the iteration cap"; verify against the solver.  The
// opening `do {` of this do/while was lost in extraction.
4211 if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 &&
4212 SolveNormalEquationPCGB(__lm_initial_damp) == 10)
4214 __lm_initial_damp *= 2.0f;
4215 }
while (__lm_initial_damp < 1024.0f);
4216 std::cout <<
"damping set to " << __lm_initial_damp <<
" for profiling\n"
4217 <<
"---------------------------------\n";
// Time the two normal-equation PCG solvers with the CG iteration count
// pinned to exactly 10 (the caller's limits are restored below).
4220 int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration;
4221 __cg_max_iteration = __cg_min_iteration = 10;
4222 __num_cg_iteration = 0;
4223 PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp));
4224 if (__num_cg_iteration != 100)
4225 std::cout << __num_cg_iteration <<
" cg iterations in all\n";
4227 __num_cg_iteration = 0;
4228 PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp));
4229 if (__num_cg_iteration != 100)
4230 std::cout << __num_cg_iteration <<
" cg iterations in all\n";
4231 std::cout <<
"---------------------------------\n";
// Time one full LM test iteration, reduced (LMX) and standard (LMB) form.
4233 __num_cg_iteration = 0;
4234 PROXILE(
"Single iteration LMX", RunTestIterationLM(
true));
4235 if (__num_cg_iteration != 100)
4236 std::cout << __num_cg_iteration <<
" cg iterations in all\n";
4238 __num_cg_iteration = 0;
4239 PROXILE(
"Single iteration LMB", RunTestIterationLM(
false));
4240 if (__num_cg_iteration != 100)
4241 std::cout << __num_cg_iteration <<
" cg iterations in all\n";
4242 std::cout <<
"---------------------------------\n";
// Restore the caller's CG iteration limits.
4243 __cg_max_iteration = cgmax;
4244 __cg_min_iteration = cgmin;
// Fragments of the vector-primitive profiling calls: the macro heads that
// opened these argument lists were lost in extraction.
4254 ((Float)0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK));
4255 std::cout <<
"---------------------------------\n";
4257 (_cuImageProj, nthread[FUNC_VS]));
4261 avec<Float> temp1(_cuImageProj.size()), temp2(_cuImageProj.size());
4264 ((Float)0.01f, _cuImageProj, temp1, temp2, nthread[FUNC_VV]));
4267 std::cout <<
"---------------------------------\n";
// Profile projection evaluation and block-preconditioner application, with
// per-function thread-count search.
4268 __multiply_jx_usenoj =
false;
4271 PROTILE(FUNC_PJ, EvaluateProjection,
4272 (_cuCameraData, _cuPointData, _cuImageProj));
4273 PROTILE2(FUNC_MPC, FUNC_MPP, ApplyBlockPC, (_cuVectorJtE, _cuVectorPK));
// Profile jacobian evaluation / JtE / preconditioner construction under each
// jacobian-storage configuration (original JC, transposed JC, neither).
4276 if (!__no_jacobian_store) {
4277 if (__jc_store_original) {
4280 if (__jc_store_transpose) {
4281 PROTILE(FUNC_JJ_JCO_JCT_JP, EvaluateJacobians, ());
4283 (_cuImageProj, _cuVectorJtE));
4284 PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f,
true));
4287 std::cout <<
"---------------------------------\n"
4288 "| Not storing original JC | \n"
4289 "---------------------------------\n";
4290 __jc_store_original =
false;
4291 PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ());
4292 __jc_store_original =
true;
4296 std::cout <<
"---------------------------------\n"
4297 "| Not storing transpose JC | \n"
4298 "---------------------------------\n";
4299 __jc_store_transpose =
false;
4300 _cuJacobianCameraT.resize(0);
4301 PROTILE(FUNC_JJ_JCO_JP, EvaluateJacobians, ());
4303 (_cuImageProj, _cuVectorJtE));
4304 PROTILE2(FUNC_BCC_JCO, FUNC_BCP, ComputeBlockPC, (0.001f,
true));
4306 }
else if (__jc_store_transpose) {
4308 (_cuImageProj, _cuVectorJtE));
4309 PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f,
true));
4312 std::cout <<
"---------------------------------\n"
4313 "| Not storing original JC | \n"
4314 "---------------------------------\n";
4315 PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ());
// Profile with the camera jacobians discarded entirely...
4319 if (!__no_jacobian_store) {
4320 std::cout <<
"---------------------------------\n"
4321 "| Not storing Camera Jacobians | \n"
4322 "---------------------------------\n";
4323 __jc_store_transpose =
false;
4324 __jc_store_original =
false;
4325 _cuJacobianCamera.resize(0);
4326 _cuJacobianCameraT.resize(0);
4327 PROTILE(FUNC_JJ_JP, EvaluateJacobians, ());
// ...and finally with no jacobian storage at all.
4333 std::cout <<
"---------------------------------\n"
4334 "| Not storing any jacobians |\n"
4335 "---------------------------------\n";
4336 __no_jacobian_store =
true;
4337 _cuJacobianPoint.resize(0);
4340 PROFILE(ComputeBlockPC, (0.001f,
true));
4341 std::cout <<
"---------------------------------\n";
// NOTE(review): extracted fragment of the helper that returns the number of
// processor cores (FindProcessorCoreNum in the original source).  The
// function signature and several preprocessor lines (the #ifdef _WIN32 /
// #else / #endif structure between the branches) are missing from this
// listing -- restore from the original file before compiling.
4344 template <
class Float>
// Windows Store / UWP (WINAPI_FAMILY_APP) builds query via GetNativeSystemInfo.
4347 #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
4348 SYSTEM_INFO sysinfo;
4349 GetNativeSystemInfo(&sysinfo);
// Presumably the desktop-Win32 branch; its #else directive was lost in
// extraction.
4351 SYSTEM_INFO sysinfo;
4352 GetSystemInfo(&sysinfo);
4354 return sysinfo.dwNumberOfProcessors;
// POSIX branch: number of processors currently online.
4356 return sysconf(_SC_NPROCESSORS_ONLN);
4361 #ifndef SIMD_NO_DOUBLE
// NOTE(review): the empty macro definitions below look like index entries
// produced by the documentation extraction rather than real code -- as
// written they would redefine (and thereby disable) the profiling macros
// (PROTILE, PROTILE2) defined earlier in this file and the threading /
// allocation helpers (BEGIN_THREAD_PROC, END_THREAD_RPOC [sic], RUN_THREAD,
// WAIT_THREAD, DEFINE_THREAD_DATA, ALLOCATE_REQUIRED_DATA,
// ALLOCATE_OPTIONAL_DATA) whose real bodies are not visible here.  Confirm
// against the original source before keeping them.
#define PROTILE(FID, A, B)
#define DEFINE_THREAD_DATA(X)
#define ALLOCATE_REQUIRED_DATA(NAME, num, channels)
#define PROTILE2(FID1, FID2, A, B)
#define END_THREAD_RPOC(X)
#define BEGIN_THREAD_PROC(X)
#define WAIT_THREAD(tv, n)
#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option)
#define RUN_THREAD(X, t,...)
void SaveBundleRecord(int iter, float res, float damping, float gn, float gi)
int __num_cpu_thread[NUM_FUNC]
bool __multiply_jx_usenoj
bool InitializeStorageForCG()
void SaveBundleRecord(int iter, float res, float damping, float &g_norm, float &g_inf)
bool InitializeStorageForSFM()
virtual void SetProjection(size_t nproj, const Point2D *imgpts, const int *point_idx, const int *cam_idx)
void ComputeJtE(VectorF &E, VectorF &JtE, int mode=0)
bool ProcessIndexCameraQ(std::vector< int > &qmap, std::vector< int > &qlist)
void RunTestIterationLM(bool reduced)
int SolveNormalEquationPCGX(float lambda)
float EvaluateProjectionX(VectorF &cam, VectorF &point, VectorF &proj)
virtual int RunBundleAdjustment()
float EvaluateDeltaNorm()
void PrepareJacobianNormalization()
void NonlinearOptimizeLM()
int SolveNormalEquationPCGB(float lambda)
float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, float damping)
virtual void SetCameraData(size_t ncam, CameraT *cams)
void ProcessWeightCameraQ(std::vector< int > &cpnum, std::vector< int > &qmap, Float *qmapw, Float *qlistw)
void ComputeBlockPC(float lambda, bool dampd)
virtual float GetMeanSquaredError()
virtual void SetPointData(size_t npoint, Point3D *pts)
void ComputeDiagonal(VectorF &JJI)
virtual void SetFocalMask(const int *fmask, float weight)
void ComputeJX(VectorF &X, VectorF &JX, int mode=0)
int SolveNormalEquation(float lambda)
void ApplyBlockPC(VectorF &v, VectorF &pv, int mode=0)
float UpdateCameraPoint(VectorF &dx, VectorF &cuImageTempProj)
void AdjustBundleAdjsutmentMode()
float EvaluateProjection(VectorF &cam, VectorF &point, VectorF &proj)
void TransferDataToHost()
void SaveToFile(const char *name)
void swap(avec< Float > &next)
void set(Float *data, size_t count)
__host__ __device__ float2 fabs(float2 v)
static void error(char *msg)
MiniVec< float, N > floor(const MiniVec< float, N > &a)
double ComputeVectorDot(const avec< Float > &vec1, const avec< Float > &vec2)
void ComputeJX_(size_t nproj, size_t ncam, const Float *x, Float *jx, const Float *camera, const Float *point, const Float *ms, const Float *sj, const int *jmap, bool intrinsic_fixed, int radial_distortion, int mode, int mt=16)
void ComputeProjection(size_t nproj, const Float *camera, const Float *point, const Float *ms, const int *jmap, Float *pj, int radial, int mt)
void ComputeVectorNorm(const Float *p, const Float *pe, double *psum)
void ComputeProjectionX(size_t nproj, const Float *camera, const Float *point, const Float *ms, const int *jmap, Float *pj, int radial, int mt)
void AddScaledVec8(Float a, const Float *x, Float *v)
void ComputeVXY(const avec< Float > &vec1, const avec< Float > &vec2, avec< Float > &result, size_t part=0, size_t skip=0)
void ComputeSAXPY(Float a, const avec< Float > &vec1, const avec< Float > &vec2, avec< Float > &result, int mt=0)
void MemoryCopyA(const Float *p, const Float *pe, Float *d)
void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, const Float *jc, const int *cmap, const int *cmlist, Float *di, Float *bi, int vn, bool jc_transpose, bool use_jq, int mt)
void ComputeSAX(Float a, const avec< Float > &vec1, avec< Float > &result)
double ComputeVectorNorm(const avec< Float > &vec, int mt=0)
void MultiplyBlockConditionerP(int npoint, const Float *bi, const Float *x, Float *vx, int mt=0)
void ComputeDiagonalBlock(size_t ncam, size_t npts, float lambda, bool dampd, const Float *jc, const int *cmap, const Float *jp, const int *pmap, const int *cmlist, const Float *sj, const Float *wq, Float *diag, Float *blocks, int radial_distortion, bool jc_transpose, int mt1=2, int mt2=2, int mode=0)
void MemoryCopyB(const Float *p, const Float *pe, Float *d)
void MultiplyBlockConditioner(int ncam, int npoint, const Float *blocksv, const Float *vec, Float *resultv, int radial, int mode, int mt1, int mt2)
void ComputeDiagonalAddQ(size_t ncam, const Float *qw, Float *d, const Float *sj=NULL)
Float DotProduct8(const Float *v1, const Float *v2)
void SetVectorZero(Float *p, Float *pe)
Float ComputeVectorMax(const avec< Float > &vec)
void ComputeJtEC(size_t ncam, const Float *pe, const Float *jc, const int *cmap, const int *cmlist, Float *v, bool jc_transpose, int mt)
void ComputeJQtEC(size_t ncam, const Float *pe, const int *qlist, const Float *wq, const Float *sj, Float *v)
void ComputeJX(size_t nproj, size_t ncam, const Float *x, const Float *jc, const Float *jp, const int *jmap, Float *jx, int mode, int mt=2)
double ComputeVectorNormW(const avec< Float > &vec, const avec< Float > &weight)
void MultiplyBlockConditionerC(int ncam, const Float *bi, const Float *x, Float *vx, int vn, int mt=0)
void ComputeSQRT(avec< Float > &vec)
void JacobianOne(const Float *c, const Float *pt, const Float *ms, Float *jxc, Float *jyc, Float *jxp, Float *jyp, bool intrinsic_fixed, int radial_distortion)
void ComputeJacobian(size_t nproj, size_t ncam, const Float *camera, const Float *point, Float *jc, Float *jp, const int *jmap, const Float *sj, const Float *ms, const int *cmlist, bool intrinsic_fixed, int radial_distortion, bool shuffle, Float *jct, int mt=2, int i0=0)
void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float *ee, Float *jte, const Float *camera, const Float *point, const Float *ms, const int *jmap, const int *cmap, const int *cmlist, const int *pmap, const Float *jp, bool intrinsic_fixed, int radial_distortion, int mode, int mt)
void ComputeTwoJX(const Float *jc, const Float *jp, const Float *xc, const Float *xp, Float *jx)
void ComputeJtEP(size_t npt, const Float *pe, const Float *jp, const int *pmap, Float *v, int mt)
void AddBlockJtJ(const Float *jc, Float *block, int vn)
void ComputeSXYPZ(Float a, const avec< Float > &vec1, const avec< Float > &vec2, const avec< Float > &vec3, avec< Float > &result)
void ComputeDiagonal(const avec< Float > &jcv, const vector< int > &cmapv, const avec< Float > &jpv, const vector< int > &pmapv, const vector< int > &cmlistv, const Float *qw0, avec< Float > &jtjdi, bool jc_transpose, int radial)
void ComputeRSQRT(avec< Float > &vec)
void ComputeJQX(size_t nq, const Float *x, const int *qmap, const Float *wq, const Float *sj, Float *jx)
void ComputeProjectionQ(size_t nq, const Float *camera, const int *qmap, const Float *wq, Float *pj)
void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, const Float *jp, const int *pmap, Float *di, Float *bi, int mt)
void InvertSymmetricMatrix(T a[n][m], T ai[n][m])
void UpdateCamera(int ncam, const avec< Float > &camera, const avec< Float > &delta, avec< Float > &new_camera)
void UpdateCameraPoint(int ncam, const avec< Float > &camera, const avec< Float > &point, avec< Float > &delta, avec< Float > &new_camera, avec< Float > &new_point, int mode, int mt)
void ComputeJtEC_(size_t ncam, const Float *ee, Float *jte, const Float *c, const Float *point, const Float *ms, const int *jmap, const int *cmap, const int *cmlist, bool intrinsic_fixed, int radial_distortion, int mt)
void UncompressRodriguesRotation(const Float r[3], Float m[])
void GetRodriguesRotation(const Float m[3][3], Float r[3])
void ComputeJtE(size_t ncam, size_t npt, const Float *pe, const Float *jc, const int *cmap, const int *cmlist, const Float *jp, const int *pmap, Float *v, bool jc_transpose, int mode, int mt1, int mt2)
void ComputeDiagonalBlock_(float lambda, bool dampd, const avec< Float > &camerav, const avec< Float > &pointv, const avec< Float > &meas, const vector< int > &jmapv, const avec< Float > &sjv, avec< Float > &qwv, avec< Float > &diag, avec< Float > &blocks, bool intrinsic_fixed, int radial_distortion, int mode=0)
void ScaleJ8(Float *jcx, Float *jcy, const Float *sj)
void SetVectorZero(avec< Float > &vec)
void ComputeSAXPY(Float a, const Float *it1, const Float *it2, Float *it3, Float *ite)
void ComputeSXYPZ(Float a, const Float *p1, const Float *p2, const Float *p3, Float *p4, Float *pe)
ParallelBA * NewSparseBundleCPU(bool dp, const int num_threads)