// ACloudViewer 3.9.4
// A Modern Library for 3D Data Processing
// SparseBundleCPU.cpp
// File: SparseBundleCPU.cpp
// Author: Changchang Wu
// Description : implementation of the CPU-based multicore bundle adjustment
//
// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)
// and the University of Washington at Seattle
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation; either
// Version 3 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
20 
21 #include <stdlib.h>
22 #include <vector>
23 #include <iostream>
24 #include <utility>
25 #include <algorithm>
26 #include <fstream>
27 #include <sstream>
28 #include <iomanip>
29 #include <cmath>
30 
31 using std::vector;
32 using std::cout;
33 using std::pair;
34 using std::ofstream;
35 using std::max;
36 
37 #include <math.h>
38 #include <time.h>
39 #include <float.h>
40 #include "pba.h"
41 #include "SparseBundleCPU.h"
42 
43 #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
44 #include <thread>
45 #endif
46 
47 //#define POINT_DATA_ALIGN4
48 #if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__)
49 #undef CPUPBA_USE_SSE
50 #undef CPUPBA_USE_AVX
51 #undef POINT_DATA_ALIGN4
52 #if defined(_M_ARM) && _M_ARM >= 7 && !defined(DISABLE_CPU_NEON)
53 #include <arm_neon.h>
54 #define CPUPBA_USE_NEON
55 #elif defined(__ARM_NEON) && !defined(DISABLE_CPU_NEON)
56 #include <arm_neon.h>
57 #define CPUPBA_USE_NEON
58 #endif
59 #elif defined(__AVX__) && !defined(DISABLE_CPU_AVX)
60 #include <immintrin.h>
61 #define CPUPBA_USE_AVX
62 #undef CPUPBA_USE_SSE
63 #undef POINT_DATA_ALIGN4
64 #elif (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)) && !defined(DISABLE_CPU_SSE)
65 #define CPUPBA_USE_SSE
66 #include <xmmintrin.h>
67 #include <emmintrin.h>
68 #endif
69 
70 #ifdef POINT_DATA_ALIGN4
71 #define POINT_ALIGN 4
72 #else
73 #define POINT_ALIGN 3
74 #endif
75 
76 #define POINT_ALIGN2 (POINT_ALIGN * 2)
77 
78 #ifdef _WIN32
79 #define NOMINMAX
80 #include <windows.h>
81 #define INLINESUFIX
82 #define finite _finite
83 #else
84 #include <pthread.h>
85 #include <sched.h>
86 #include <unistd.h>
87 #endif
88 
89 // maximum thread count
90 #define THREAD_NUM_MAX 64
91 // compute the number of threads for vector operatoins, pure heuristics...
92 #define AUTO_MT_NUM(sz) \
93  int((log((double)sz) / log(2.0) - 18.5) * __num_cpu_cores / 16.0)
94 
95 namespace pba {
96 
97 template <class Float>
98 void avec<Float>::SaveToFile(const char* name) {
99  ofstream out(name);
100  for (Float* p = _data; p < _last; ++p) out << (*p) << '\n';
101 }
102 
103 #ifdef CPUPBA_USE_SSE
104 #define CPUPBA_USE_SIMD
105 namespace MYSSE {
106 template <class Float>
107 class SSE {};
108 template <>
109 class SSE<float> {
110  public:
111  typedef __m128 sse_type;
112  static inline sse_type zero() { return _mm_setzero_ps(); }
113 };
114 template <>
115 class SSE<double> {
116  public:
117  typedef __m128d sse_type;
118  static inline sse_type zero() { return _mm_setzero_pd(); }
119 };
120 
122 template <class Float>
123 inline size_t sse_step() {
124  return 16 / sizeof(Float);
125 };
126 inline __m128 sse_load1(const float* p) { return _mm_load1_ps(p); }
127 inline __m128 sse_load(const float* p) { return _mm_load_ps(p); }
128 inline __m128 sse_add(__m128 s1, __m128 s2) { return _mm_add_ps(s1, s2); }
129 inline __m128 sse_sub(__m128 s1, __m128 s2) { return _mm_sub_ps(s1, s2); }
130 inline __m128 sse_mul(__m128 s1, __m128 s2) { return _mm_mul_ps(s1, s2); }
131 inline __m128 sse_sqrt(__m128 s) { return _mm_sqrt_ps(s); }
132 
133 inline __m128d sse_load1(const double* p) { return _mm_load1_pd(p); }
134 inline __m128d sse_load(const double* p) { return _mm_load_pd(p); }
135 inline __m128d sse_add(__m128d s1, __m128d s2) { return _mm_add_pd(s1, s2); }
136 inline __m128d sse_sub(__m128d s1, __m128d s2) { return _mm_sub_pd(s1, s2); }
137 inline __m128d sse_mul(__m128d s1, __m128d s2) { return _mm_mul_pd(s1, s2); }
138 inline __m128d sse_sqrt(__m128d s) { return _mm_sqrt_pd(s); }
139 
140 #ifdef _WIN32
141 inline float sse_sum(__m128 s) {
142  return (s.m128_f32[0] + s.m128_f32[2]) + (s.m128_f32[1] + s.m128_f32[3]);
143 }
144 inline double sse_sum(__m128d s) { return s.m128d_f64[0] + s.m128d_f64[1]; }
145 #else
146 inline float sse_sum(__m128 s) {
147  float* f = (float*)(&s);
148  return (f[0] + f[2]) + (f[1] + f[3]);
149 }
150 inline double sse_sum(__m128d s) {
151  double* d = (double*)(&s);
152  return d[0] + d[1];
153 }
154 #endif
155 // inline float sse_dot(__m128 s1, __m128 s2) {__m128 temp = _mm_dp_ps(s1,
156 // s2, 0xF1); float* f = (float*) (&temp); return f[0]; }
157 // inline double sse_dot(__m128d s1, __m128d s2) {__m128d temp =
158 // _mm_dp_pd(s1, s2, 0x31); double* f = (double*) (&temp); return f[0] ; }
159 inline void sse_store(float* p, __m128 s) { _mm_store_ps(p, s); }
160 inline void sse_store(double* p, __m128d s) { _mm_store_pd(p, s); }
161 
162 inline void data_prefetch(const void* p) {
163  _mm_prefetch((const char*)p, _MM_HINT_NTA);
164 }
165 };
166 
167 namespace ProgramCPU {
168 using namespace MYSSE;
169 #define SSE_ZERO SSE<Float>::zero()
170 #define SSE_T typename SSE<Float>::sse_type
172 inline void ScaleJ4(float* jcx, float* jcy, const float* sj) {
173  __m128 ps = _mm_load_ps(sj);
174  _mm_store_ps(jcx, _mm_mul_ps(_mm_load_ps(jcx), ps));
175  _mm_store_ps(jcy, _mm_mul_ps(_mm_load_ps(jcy), ps));
176 }
177 inline void ScaleJ8(float* jcx, float* jcy, const float* sj) {
178  ScaleJ4(jcx, jcy, sj);
179  ScaleJ4(jcx + 4, jcy + 4, sj + 4);
180 }
181 inline void ScaleJ4(double* jcx, double* jcy, const double* sj) {
182  __m128d ps1 = _mm_load_pd(sj), ps2 = _mm_load_pd(sj + 2);
183  _mm_store_pd(jcx, _mm_mul_pd(_mm_load_pd(jcx), ps1));
184  _mm_store_pd(jcy, _mm_mul_pd(_mm_load_pd(jcy), ps1));
185  _mm_store_pd(jcx + 2, _mm_mul_pd(_mm_load_pd(jcx + 2), ps2));
186  _mm_store_pd(jcy + 2, _mm_mul_pd(_mm_load_pd(jcy + 2), ps2));
187 }
188 inline void ScaleJ8(double* jcx, double* jcy, const double* sj) {
189  ScaleJ4(jcx, jcy, sj);
190  ScaleJ4(jcx + 4, jcy + 4, sj + 4);
191 }
192 inline float DotProduct8(const float* v1, const float* v2) {
193  __m128 ds = _mm_add_ps(_mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2)),
194  _mm_mul_ps(_mm_load_ps(v1 + 4), _mm_load_ps(v2 + 4)));
195  return sse_sum(ds);
196 }
197 inline double DotProduct8(const double* v1, const double* v2) {
198  __m128d d1 = _mm_mul_pd(_mm_load_pd(v1), _mm_load_pd(v2));
199  __m128d d2 = _mm_mul_pd(_mm_load_pd(v1 + 2), _mm_load_pd(v2 + 2));
200  __m128d d3 = _mm_mul_pd(_mm_load_pd(v1 + 4), _mm_load_pd(v2 + 4));
201  __m128d d4 = _mm_mul_pd(_mm_load_pd(v1 + 6), _mm_load_pd(v2 + 6));
202  __m128d ds = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4));
203  return sse_sum(ds);
204 }
205 
206 inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc,
207  const float* xp, float* jx) {
208 #ifdef POINT_DATA_ALIGN4
209  __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4),
210  mxp = _mm_load_ps(xp);
211  __m128 ds1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc), xc1),
212  _mm_mul_ps(_mm_load_ps(jc + 4), xc2));
213  __m128 dx1 = _mm_add_ps(ds1, _mm_mul_ps(_mm_load_ps(jp), mxp));
214  jx[0] = sse_sum(dx1);
215  __m128 ds2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc + 8), xc1),
216  _mm_mul_ps(_mm_load_ps(jc + 12), xc2));
217  __m128 dx2 = _mm_add_ps(ds2, _mm_mul_ps(_mm_load_ps(jp + 4), mxp));
218  jx[1] = sse_sum(dx2);
219 #else
220  __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4);
221  __m128 jc1 = _mm_load_ps(jc), jc2 = _mm_load_ps(jc + 4);
222  __m128 jc3 = _mm_load_ps(jc + 8), jc4 = _mm_load_ps(jc + 12);
223  __m128 ds1 = _mm_add_ps(_mm_mul_ps(jc1, xc1), _mm_mul_ps(jc2, xc2));
224  __m128 ds2 = _mm_add_ps(_mm_mul_ps(jc3, xc1), _mm_mul_ps(jc4, xc2));
225  jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
226  jx[1] =
227  sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] +
228  jp[POINT_ALIGN + 2] * xp[2]);
229 /*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] *
230 xp[1] + jp[2] * xp[2]);
231 jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] +
232 jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/
233 #endif
234 }
235 
236 inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc,
237  const double* xp, double* jx) {
238  __m128d xc1 = _mm_load_pd(xc), xc2 = _mm_load_pd(xc + 2),
239  xc3 = _mm_load_pd(xc + 4), xc4 = _mm_load_pd(xc + 6);
240  __m128d d1 = _mm_mul_pd(_mm_load_pd(jc), xc1);
241  __m128d d2 = _mm_mul_pd(_mm_load_pd(jc + 2), xc2);
242  __m128d d3 = _mm_mul_pd(_mm_load_pd(jc + 4), xc3);
243  __m128d d4 = _mm_mul_pd(_mm_load_pd(jc + 6), xc4);
244  __m128d ds1 = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4));
245  jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
246 
247  __m128d d5 = _mm_mul_pd(_mm_load_pd(jc + 8), xc1);
248  __m128d d6 = _mm_mul_pd(_mm_load_pd(jc + 10), xc2);
249  __m128d d7 = _mm_mul_pd(_mm_load_pd(jc + 12), xc3);
250  __m128d d8 = _mm_mul_pd(_mm_load_pd(jc + 14), xc4);
251  __m128d ds2 = _mm_add_pd(_mm_add_pd(d5, d6), _mm_add_pd(d7, d8));
252  jx[1] =
253  sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] +
254  jp[POINT_ALIGN + 2] * xp[2]);
255 }
256 
257 // v += ax
258 inline void AddScaledVec8(float a, const float* x, float* v) {
259  __m128 aa = sse_load1(&a);
260  _mm_store_ps(v, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x), aa), _mm_load_ps(v)));
261  _mm_store_ps(v + 4, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x + 4), aa),
262  _mm_load_ps(v + 4)));
263 }
264 // v += ax
265 inline void AddScaledVec8(double a, const double* x, double* v) {
266  __m128d aa = sse_load1(&a);
267  _mm_store_pd(v, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x), aa), _mm_load_pd(v)));
268  _mm_store_pd(v + 2, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 2), aa),
269  _mm_load_pd(v + 2)));
270  _mm_store_pd(v + 4, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 4), aa),
271  _mm_load_pd(v + 4)));
272  _mm_store_pd(v + 6, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 6), aa),
273  _mm_load_pd(v + 6)));
274 }
275 
276 inline void AddBlockJtJ(const float* jc, float* block, int vn) {
277  __m128 j1 = _mm_load_ps(jc);
278  __m128 j2 = _mm_load_ps(jc + 4);
279  for (int i = 0; i < vn; ++i, ++jc, block += 8) {
280  __m128 a = sse_load1(jc);
281  _mm_store_ps(block + 0,
282  _mm_add_ps(_mm_mul_ps(a, j1), _mm_load_ps(block + 0)));
283  _mm_store_ps(block + 4,
284  _mm_add_ps(_mm_mul_ps(a, j2), _mm_load_ps(block + 4)));
285  }
286 }
287 
288 inline void AddBlockJtJ(const double* jc, double* block, int vn) {
289  __m128d j1 = _mm_load_pd(jc);
290  __m128d j2 = _mm_load_pd(jc + 2);
291  __m128d j3 = _mm_load_pd(jc + 4);
292  __m128d j4 = _mm_load_pd(jc + 6);
293  for (int i = 0; i < vn; ++i, ++jc, block += 8) {
294  __m128d a = sse_load1(jc);
295  _mm_store_pd(block + 0,
296  _mm_add_pd(_mm_mul_pd(a, j1), _mm_load_pd(block + 0)));
297  _mm_store_pd(block + 2,
298  _mm_add_pd(_mm_mul_pd(a, j2), _mm_load_pd(block + 2)));
299  _mm_store_pd(block + 4,
300  _mm_add_pd(_mm_mul_pd(a, j3), _mm_load_pd(block + 4)));
301  _mm_store_pd(block + 6,
302  _mm_add_pd(_mm_mul_pd(a, j4), _mm_load_pd(block + 6)));
303  }
304 }
305 };
306 #endif
307 
308 #ifdef CPUPBA_USE_AVX
309 #define CPUPBA_USE_SIMD
310 namespace MYAVX {
311 template <class Float>
312 class SSE {};
313 template <>
314 class SSE<float> {
315  public:
316  typedef __m256 sse_type; // static size_t step() {return 4;}
317  static inline sse_type zero() { return _mm256_setzero_ps(); }
318 };
319 template <>
320 class SSE<double> {
321  public:
322  typedef __m256d sse_type; // static size_t step() {return 2;}
323  static inline sse_type zero() { return _mm256_setzero_pd(); }
324 };
325 
327 template <class Float>
328 inline size_t sse_step() {
329  return 32 / sizeof(Float);
330 };
331 inline __m256 sse_load1(const float* p) { return _mm256_broadcast_ss(p); }
332 inline __m256 sse_load(const float* p) { return _mm256_load_ps(p); }
333 inline __m256 sse_add(__m256 s1, __m256 s2) { return _mm256_add_ps(s1, s2); }
334 inline __m256 sse_sub(__m256 s1, __m256 s2) { return _mm256_sub_ps(s1, s2); }
335 inline __m256 sse_mul(__m256 s1, __m256 s2) { return _mm256_mul_ps(s1, s2); }
336 inline __m256 sse_sqrt(__m256 s) { return _mm256_sqrt_ps(s); }
337 
338 // inline __m256 sse_fmad(__m256 a, __m256 b, __m256 c) {return
339 // _mm256_fmadd_ps(a, b, c);}
340 
341 inline __m256d sse_load1(const double* p) { return _mm256_broadcast_sd(p); }
342 inline __m256d sse_load(const double* p) { return _mm256_load_pd(p); }
343 inline __m256d sse_add(__m256d s1, __m256d s2) { return _mm256_add_pd(s1, s2); }
344 inline __m256d sse_sub(__m256d s1, __m256d s2) { return _mm256_sub_pd(s1, s2); }
345 inline __m256d sse_mul(__m256d s1, __m256d s2) { return _mm256_mul_pd(s1, s2); }
346 inline __m256d sse_sqrt(__m256d s) { return _mm256_sqrt_pd(s); }
347 
348 #ifdef _WIN32
349 inline float sse_sum(__m256 s) {
350  return ((s.m256_f32[0] + s.m256_f32[4]) + (s.m256_f32[2] + s.m256_f32[6])) +
351  ((s.m256_f32[1] + s.m256_f32[5]) + (s.m256_f32[3] + s.m256_f32[7]));
352 }
353 inline double sse_sum(__m256d s) {
354  return (s.m256d_f64[0] + s.m256d_f64[2]) + (s.m256d_f64[1] + s.m256d_f64[3]);
355 }
356 #else
357 inline float sse_sum(__m256 s) {
358  float* f = (float*)(&s);
359  return ((f[0] + f[4]) + (f[2] + f[6])) + ((f[1] + f[5]) + (f[3] + f[7]));
360 }
361 inline double sse_sum(__m256d s) {
362  double* d = (double*)(&s);
363  return (d[0] + d[2]) + (d[1] + d[3]);
364 }
365 #endif
366 inline float sse_dot(__m256 s1, __m256 s2) {
367  __m256 temp = _mm256_dp_ps(s1, s2, 0xf1);
368  float* f = (float*)(&temp);
369  return f[0] + f[4];
370 }
371 inline double sse_dot(__m256d s1, __m256d s2) {
372  return sse_sum(sse_mul(s1, s2));
373 }
374 
375 inline void sse_store(float* p, __m256 s) { _mm256_store_ps(p, s); }
376 inline void sse_store(double* p, __m256d s) { _mm256_store_pd(p, s); }
377 
378 inline void data_prefetch(const void* p) {
379  _mm_prefetch((const char*)p, _MM_HINT_NTA);
380 }
381 };
382 
383 namespace ProgramCPU {
384 using namespace MYAVX;
385 #define SSE_ZERO SSE<Float>::zero()
386 #define SSE_T typename SSE<Float>::sse_type
387 
389 inline void ScaleJ8(float* jcx, float* jcy, const float* sj) {
390  __m256 ps = _mm256_load_ps(sj);
391  _mm256_store_ps(jcx, _mm256_mul_ps(_mm256_load_ps(jcx), ps));
392  _mm256_store_ps(jcy, _mm256_mul_ps(_mm256_load_ps(jcy), ps));
393 }
394 inline void ScaleJ4(double* jcx, double* jcy, const double* sj) {
395  __m256d ps = _mm256_load_pd(sj);
396  _mm256_store_pd(jcx, _mm256_mul_pd(_mm256_load_pd(jcx), ps));
397  _mm256_store_pd(jcy, _mm256_mul_pd(_mm256_load_pd(jcy), ps));
398 }
399 inline void ScaleJ8(double* jcx, double* jcy, const double* sj) {
400  ScaleJ4(jcx, jcy, sj);
401  ScaleJ4(jcx + 4, jcy + 4, sj + 4);
402 }
403 inline float DotProduct8(const float* v1, const float* v2) {
404  return sse_dot(_mm256_load_ps(v1), _mm256_load_ps(v2));
405 }
406 inline double DotProduct8(const double* v1, const double* v2) {
407  __m256d ds = _mm256_add_pd(
408  _mm256_mul_pd(_mm256_load_pd(v1), _mm256_load_pd(v2)),
409  _mm256_mul_pd(_mm256_load_pd(v1 + 4), _mm256_load_pd(v2 + 4)));
410  return sse_sum(ds);
411 }
412 
413 inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc,
414  const float* xp, float* jx) {
415  __m256 xcm = _mm256_load_ps(xc), jc1 = _mm256_load_ps(jc),
416  jc2 = _mm256_load_ps(jc + 8);
417  jx[0] = sse_dot(jc1, xcm) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
418  jx[1] = sse_dot(jc2, xcm) +
419  (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] +
420  jp[POINT_ALIGN + 2] * xp[2]);
421 }
422 
423 inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc,
424  const double* xp, double* jx) {
425  __m256d xc1 = _mm256_load_pd(xc), xc2 = _mm256_load_pd(xc + 4);
426  __m256d jc1 = _mm256_load_pd(jc), jc2 = _mm256_load_pd(jc + 4);
427  __m256d jc3 = _mm256_load_pd(jc + 8), jc4 = _mm256_load_pd(jc + 12);
428  __m256d ds1 = _mm256_add_pd(_mm256_mul_pd(jc1, xc1), _mm256_mul_pd(jc2, xc2));
429  __m256d ds2 = _mm256_add_pd(_mm256_mul_pd(jc3, xc1), _mm256_mul_pd(jc4, xc2));
430  jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
431  jx[1] =
432  sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] +
433  jp[POINT_ALIGN + 2] * xp[2]);
434 }
435 
436 // v += ax
437 inline void AddScaledVec8(float a, const float* x, float* v) {
438  __m256 aa = sse_load1(&a);
439  _mm256_store_ps(v, _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(x), aa),
440  _mm256_load_ps(v)));
441  //_mm256_store_ps(v, _mm256_fmadd_ps(_mm256_load_ps(x), aa,
442  //_mm256_load_ps(v)));
443 }
444 // v += ax
445 inline void AddScaledVec8(double a, const double* x, double* v) {
446  __m256d aa = sse_load1(&a);
447  _mm256_store_pd(v, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x), aa),
448  _mm256_load_pd(v)));
449  _mm256_store_pd(v + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x + 4), aa),
450  _mm256_load_pd(v + 4)));
451 }
452 
453 inline void AddBlockJtJ(const float* jc, float* block, int vn) {
454  __m256 j = _mm256_load_ps(jc);
455  for (int i = 0; i < vn; ++i, ++jc, block += 8) {
456  __m256 a = sse_load1(jc);
457  _mm256_store_ps(block,
458  _mm256_add_ps(_mm256_mul_ps(a, j), _mm256_load_ps(block)));
459  }
460 }
461 
462 inline void AddBlockJtJ(const double* jc, double* block, int vn) {
463  __m256d j1 = _mm256_load_pd(jc);
464  __m256d j2 = _mm256_load_pd(jc + 4);
465  for (int i = 0; i < vn; ++i, ++jc, block += 8) {
466  __m256d a = sse_load1(jc);
467  _mm256_store_pd(block + 0, _mm256_add_pd(_mm256_mul_pd(a, j1),
468  _mm256_load_pd(block + 0)));
469  _mm256_store_pd(block + 4, _mm256_add_pd(_mm256_mul_pd(a, j2),
470  _mm256_load_pd(block + 4)));
471  }
472 }
473 };
474 
475 #endif
476 
477 #ifdef CPUPBA_USE_NEON
478 #define CPUPBA_USE_SIMD
479 #define SIMD_NO_SQRT
480 #define SIMD_NO_DOUBLE
481 namespace MYNEON {
482 template <class Float>
483 class SSE {};
484 template <>
485 class SSE<float> {
486  public:
487  typedef float32x4_t sse_type;
488 };
489 
491 template <class Float>
492 inline size_t sse_step() {
493  return 16 / sizeof(Float);
494 };
495 inline float32x4_t sse_load1(const float* p) { return vld1q_dup_f32(p); }
496 inline float32x4_t sse_load(const float* p) { return vld1q_f32(p); }
497 inline float32x4_t sse_loadzero() {
498  float z = 0;
499  return sse_load1(&z);
500 }
501 inline float32x4_t sse_add(float32x4_t s1, float32x4_t s2) {
502  return vaddq_f32(s1, s2);
503 }
504 inline float32x4_t sse_sub(float32x4_t s1, float32x4_t s2) {
505  return vsubq_f32(s1, s2);
506 }
507 inline float32x4_t sse_mul(float32x4_t s1, float32x4_t s2) {
508  return vmulq_f32(s1, s2);
509 }
510 // inline float32x4_t sse_sqrt(float32x4_t s) {return
511 // _mm_sqrt_ps(s); }
512 inline float sse_sum(float32x4_t s) {
513  float* f = (float*)(&s);
514  return (f[0] + f[2]) + (f[1] + f[3]);
515 }
516 inline void sse_store(float* p, float32x4_t s) { vst1q_f32(p, s); }
517 inline void data_prefetch(const void* p) {}
518 };
519 namespace ProgramCPU {
520 using namespace MYNEON;
521 #define SSE_ZERO sse_loadzero()
522 #define SSE_T typename SSE<Float>::sse_type
524 inline void ScaleJ4(float* jcx, float* jcy, const float* sj) {
525  float32x4_t ps = sse_load(sj);
526  sse_store(jcx, sse_mul(sse_load(jcx), ps));
527  sse_store(jcy, sse_mul(sse_load(jcy), ps));
528 }
529 inline void ScaleJ8(float* jcx, float* jcy, const float* sj) {
530  ScaleJ4(jcx, jcy, sj);
531  ScaleJ4(jcx + 4, jcy + 4, sj + 4);
532 }
533 
534 inline float DotProduct8(const float* v1, const float* v2) {
535  float32x4_t ds = sse_add(sse_mul(sse_load(v1), sse_load(v2)),
536  sse_mul(sse_load(v1 + 4), sse_load(v2 + 4)));
537  return sse_sum(ds);
538 }
539 
540 inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc,
541  const float* xp, float* jx) {
542 #ifdef POINT_DATA_ALIGN4
543  float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4), mxp = sse_load(xp);
544  float32x4_t ds1 =
545  sse_add(sse_mul(sse_load(jc), xc1), sse_mul(sse_load(jc + 4), xc2));
546  float32x4_t dx1 = sse_add(ds1, sse_mul(sse_load(jp), mxp));
547  jx[0] = sse_sum(dx1);
548  float32x4_t ds2 =
549  sse_add(sse_mul(sse_load(jc + 8), xc1), sse_mul(sse_load(jc + 12), xc2));
550  float32x4_t dx2 = sse_add(ds2, sse_mul(sse_load(jp + 4), mxp));
551  jx[1] = sse_sum(dx2);
552 #else
553  float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4);
554  float32x4_t jc1 = sse_load(jc), jc2 = sse_load(jc + 4);
555  float32x4_t jc3 = sse_load(jc + 8), jc4 = sse_load(jc + 12);
556  float32x4_t ds1 = sse_add(sse_mul(jc1, xc1), sse_mul(jc2, xc2));
557  float32x4_t ds2 = sse_add(sse_mul(jc3, xc1), sse_mul(jc4, xc2));
558  jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
559  jx[1] =
560  sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] +
561  jp[POINT_ALIGN + 2] * xp[2]);
562 /*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] *
563 xp[1] + jp[2] * xp[2]);
564 jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] +
565 jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/
566 #endif
567 }
568 
569 // v += ax
570 inline void AddScaledVec8(float a, const float* x, float* v) {
571  float32x4_t aa = sse_load1(&a);
572  sse_store(v, sse_add(sse_mul(sse_load(x), aa), sse_load(v)));
573  sse_store(v + 4, sse_add(sse_mul(sse_load(x + 4), aa), sse_load(v + 4)));
574 }
575 
576 inline void AddBlockJtJ(const float* jc, float* block, int vn) {
577  float32x4_t j1 = sse_load(jc);
578  float32x4_t j2 = sse_load(jc + 4);
579  for (int i = 0; i < vn; ++i, ++jc, block += 8) {
580  float32x4_t a = sse_load1(jc);
581  sse_store(block + 0, sse_add(sse_mul(a, j1), sse_load(block + 0)));
582  sse_store(block + 4, sse_add(sse_mul(a, j2), sse_load(block + 4)));
583  }
584 }
585 };
586 #endif
587 
588 namespace ProgramCPU {
590 template <class Float>
591 double ComputeVectorNorm(const avec<Float>& vec, int mt = 0);
592 
593 #if defined(CPUPBA_USE_SIMD)
594 template <class Float>
595 void ComputeSQRT(avec<Float>& vec) {
596 #ifndef SIMD_NO_SQRT
597  const size_t step = sse_step<Float>();
598  Float *p = &vec[0], *pe = p + vec.size(), *pex = pe - step;
599  for (; p <= pex; p += step) sse_store(p, sse_sqrt(sse_load(p)));
600  for (; p < pe; ++p) p[0] = sqrt(p[0]);
601 #else
602  for (Float* it = vec.begin(); it < vec.end(); ++it) *it = sqrt(*it);
603 #endif
604 }
605 
606 template <class Float>
607 void ComputeRSQRT(avec<Float>& vec) {
608  Float *p = &vec[0], *pe = p + vec.size();
609  for (; p < pe; ++p) p[0] = (p[0] == 0 ? 0 : Float(1.0) / p[0]);
610  ComputeSQRT(vec);
611 }
612 
613 template <class Float>
614 void SetVectorZero(Float* p, Float* pe) {
615  SSE_T sse = SSE_ZERO;
616  const size_t step = sse_step<Float>();
617  Float* pex = pe - step;
618  for (; p <= pex; p += step) sse_store(p, sse);
619  for (; p < pe; ++p) *p = 0;
620 }
621 
622 template <class Float>
623 void SetVectorZero(avec<Float>& vec) {
624  Float *p = &vec[0], *pe = p + vec.size();
625  SetVectorZero(p, pe);
626 }
627 
628 // function not used
629 template <class Float>
630 inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) {
631  const size_t step = sse_step<Float>();
632  const Float* pex = pe - step;
633  for (; p <= pex; p += step, d += step) sse_store(d, sse_load(p));
634  // while(p < pe) *d++ = *p++;
635 }
636 
637 template <class Float>
638 void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) {
639  SSE_T sse = SSE_ZERO;
640  const size_t step = sse_step<Float>();
641  const Float* pex = pe - step;
642  for (; p <= pex; p += step) {
643  SSE_T ps = sse_load(p);
644  sse = sse_add(sse, sse_mul(ps, ps));
645  }
646  double sum = sse_sum(sse);
647  for (; p < pe; ++p) sum += p[0] * p[0];
648  *psum = sum;
649 }
650 
651 template <class Float>
652 double ComputeVectorNormW(const avec<Float>& vec, const avec<Float>& weight) {
653  if (weight.begin() != NULL) {
654  SSE_T sse = SSE_ZERO;
655  const size_t step = sse_step<Float>();
656  const Float *p = vec, *pe = p + vec.size(), *pex = pe - step;
657  const Float* w = weight;
658  for (; p <= pex; p += step, w += step) {
659  SSE_T pw = sse_load(w), ps = sse_load(p);
660  sse = sse_add(sse, sse_mul(sse_mul(ps, pw), ps));
661  }
662  double sum = sse_sum(sse);
663  for (; p < pe; ++p, ++w) sum += p[0] * w[0] * p[0];
664  return sum;
665  } else {
666  return ComputeVectorNorm<Float>(vec, 0);
667  }
668 }
669 
670 template <class Float>
671 double ComputeVectorDot(const avec<Float>& vec1, const avec<Float>& vec2) {
672  SSE_T sse = SSE_ZERO;
673  const size_t step = sse_step<Float>();
674  const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step;
675  const Float* p2 = vec2;
676  for (; p1 <= pex; p1 += step, p2 += step) {
677  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
678  sse = sse_add(sse, sse_mul(ps1, ps2));
679  }
680  double sum = sse_sum(sse);
681  for (; p1 < pe; ++p1, ++p2) sum += p1[0] * p2[0];
682  return sum;
683 }
684 
685 template <class Float>
686 void ComputeVXY(const avec<Float>& vec1, const avec<Float>& vec2,
687  avec<Float>& result, size_t part = 0, size_t skip = 0) {
688  const size_t step = sse_step<Float>();
689  const Float *p1 = vec1 + skip, *pe = p1 + (part ? part : vec1.size()),
690  *pex = pe - step;
691  const Float* p2 = vec2 + skip;
692  Float* p3 = result + skip;
693  for (; p1 <= pex; p1 += step, p2 += step, p3 += step) {
694  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
695  sse_store(p3, sse_mul(ps1, ps2));
696  }
697  for (; p1 < pe; ++p1, ++p2, ++p3) *p3 = p1[0] * p2[0];
698 }
699 
700 template <class Float>
701 void ComputeSAXPY(Float a, const Float* p1, const Float* p2, Float* p3,
702  Float* pe) {
703  const size_t step = sse_step<Float>();
704  SSE_T aa = sse_load1(&a);
705  Float* pex = pe - step;
706  if (a == 1.0f) {
707  for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
708  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
709  sse_store(p3, sse_add(ps2, ps1));
710  }
711  } else if (a == -1.0f) {
712  for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
713  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
714  sse_store(p3, sse_sub(ps2, ps1));
715  }
716  } else {
717  for (; p3 <= pex; p1 += step, p2 += step, p3 += step) {
718  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2);
719  sse_store(p3, sse_add(ps2, sse_mul(ps1, aa)));
720  }
721  }
722  for (; p3 < pe; ++p1, ++p2, ++p3) p3[0] = a * p1[0] + p2[0];
723 }
724 
725 template <class Float>
726 void ComputeSAX(Float a, const avec<Float>& vec1, avec<Float>& result) {
727  const size_t step = sse_step<Float>();
728  SSE_T aa = sse_load1(&a);
729  const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step;
730  Float* p3 = result;
731  for (; p1 <= pex; p1 += step, p3 += step) {
732  sse_store(p3, sse_mul(sse_load(p1), aa));
733  }
734  for (; p1 < pe; ++p1, ++p3) p3[0] = a * p1[0];
735 }
736 
737 template <class Float>
738 inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2,
739  const Float* p3, Float* p4, Float* pe) {
740  const size_t step = sse_step<Float>();
741  SSE_T aa = sse_load1(&a);
742  Float* pex = pe - step;
743  for (; p4 <= pex; p1 += step, p2 += step, p3 += step, p4 += step) {
744  SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2), ps3 = sse_load(p3);
745  sse_store(p4, sse_add(ps3, sse_mul(sse_mul(ps1, aa), ps2)));
746  }
747  for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) p4[0] = a * p1[0] * p2[0] + p3[0];
748 }
749 
750 #else
751 template <class Float>
753  Float* it = vec.begin();
754  for (; it < vec.end(); ++it) {
755  *it = sqrt(*it);
756  }
757 }
758 template <class Float>
760  Float* it = vec.begin();
761  for (; it < vec.end(); ++it) {
762  *it = (*it == 0 ? 0 : Float(1.0) / sqrt(*it));
763  }
764 }
765 template <class Float>
766 inline void SetVectorZero(Float* p, Float* pe) {
767  std::fill(p, pe, 0);
768 }
769 template <class Float>
770 inline void SetVectorZero(avec<Float>& vec) {
771  std::fill(vec.begin(), vec.end(), 0);
772 }
773 
774 template <class Float>
775 inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) {
776  while (p < pe) *d++ = *p++;
777 }
778 
779 template <class Float>
780 double ComputeVectorNormW(const avec<Float>& vec, const avec<Float>& weight) {
781  double sum = 0;
782  const Float *it1 = vec.begin(), *it2 = weight.begin();
783  for (; it1 < vec.end(); ++it1, ++it2) {
784  sum += (*it1) * (*it2) * (*it1);
785  }
786  return sum;
787 }
788 
789 template <class Float>
790 double ComputeVectorDot(const avec<Float>& vec1, const avec<Float>& vec2) {
791  double sum = 0;
792  const Float *it1 = vec1.begin(), *it2 = vec2.begin();
793  for (; it1 < vec1.end(); ++it1, ++it2) {
794  sum += (*it1) * (*it2);
795  }
796  return sum;
797 }
798 template <class Float>
799 void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) {
800  double sum = 0;
801  for (; p < pe; ++p) sum += (*p) * (*p);
802  *psum = sum;
803 }
804 template <class Float>
805 inline void ComputeVXY(const avec<Float>& vec1, const avec<Float>& vec2,
806  avec<Float>& result, size_t part = 0, size_t skip = 0) {
807  const Float *it1 = vec1.begin() + skip, *it2 = vec2.begin() + skip;
808  const Float* ite = part ? (it1 + part) : vec1.end();
809  Float* it3 = result.begin() + skip;
810  for (; it1 < ite; ++it1, ++it2, ++it3) {
811  (*it3) = (*it1) * (*it2);
812  }
813 }
814 template <class Float>
815 void ScaleJ8(Float* jcx, Float* jcy, const Float* sj) {
816  for (int i = 0; i < 8; ++i) {
817  jcx[i] *= sj[i];
818  jcy[i] *= sj[i];
819  }
820 }
821 
822 template <class Float>
823 inline void AddScaledVec8(Float a, const Float* x, Float* v) {
824  for (int i = 0; i < 8; ++i) v[i] += (a * x[i]);
825 }
826 
827 template <class Float>
828 void ComputeSAX(Float a, const avec<Float>& vec1, avec<Float>& result) {
829  const Float* it1 = vec1.begin();
830  Float* it3 = result.begin();
831  for (; it1 < vec1.end(); ++it1, ++it3) {
832  (*it3) = (a * (*it1));
833  }
834 }
835 
836 template <class Float>
837 inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2,
838  const Float* p3, Float* p4, Float* pe) {
839  for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) *p4 = (a * (*p1) * (*p2) + (*p3));
840 }
841 
842 template <class Float>
843 void ComputeSAXPY(Float a, const Float* it1, const Float* it2, Float* it3,
844  Float* ite) {
845  if (a == (Float)1.0) {
846  for (; it3 < ite; ++it1, ++it2, ++it3) {
847  (*it3) = ((*it1) + (*it2));
848  }
849  } else {
850  for (; it3 < ite; ++it1, ++it2, ++it3) {
851  (*it3) = (a * (*it1) + (*it2));
852  }
853  }
854 }
855 template <class Float>
856 void AddBlockJtJ(const Float* jc, Float* block, int vn) {
857  for (int i = 0; i < vn; ++i) {
858  Float *row = block + i * 8, a = jc[i];
859  for (int j = 0; j < vn; ++j) row[j] += a * jc[j];
860  }
861 }
862 #endif
863 
#ifdef _WIN32
// Worker-thread glue, Win32/WinRT flavor. Each RUN_THREAD packs its
// arguments into a heap-allocated X##_STRUCT<Float>; the thread procedure
// deletes that copy on exit, so the launching scope may continue freely.
#define DEFINE_THREAD_DATA(X) \
  template <class Float>      \
  struct X##_STRUCT {
// Build the argument struct from __VA_ARGS__ and clone it onto the heap.
#define DECLEAR_THREAD_DATA(X, ...)        \
  X##_STRUCT<Float> tdata = {__VA_ARGS__}; \
  X##_STRUCT<Float>* newdata = new X##_STRUCT<Float>(tdata)
// Close the struct definition and open the thread entry point.
#define BEGIN_THREAD_PROC(X) \
  }                          \
  ;                          \
  template <class Float>     \
  DWORD X##_PROC(X##_STRUCT<Float>* q) {
// Free the argument struct and close the entry point.
// NOTE(review): "RPOC" is a long-standing typo for "PROC"; kept because all
// call sites use this spelling.
#define END_THREAD_RPOC(X) \
  delete q;                \
  return 0;                \
  }

#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
// WinRT app model: CreateThread is unavailable, use std::thread + join.
#define MYTHREAD std::thread
#define RUN_THREAD(X, t, ...)          \
  DECLEAR_THREAD_DATA(X, __VA_ARGS__); \
  t = std::thread(X##_PROC<Float>, newdata)
#define WAIT_THREAD(tv, n)                               \
  {                                                      \
    for (size_t i = 0; i < size_t(n); ++i) tv[i].join(); \
  }
#else
// Desktop Win32: raw HANDLEs with CreateThread/WaitForMultipleObjects.
#define MYTHREAD HANDLE
#define RUN_THREAD(X, t, ...)                                                 \
  DECLEAR_THREAD_DATA(X, __VA_ARGS__);                                        \
  t = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)X##_PROC<Float>, newdata, \
                   0, 0)
#define WAIT_THREAD(tv, n)                                   \
  {                                                          \
    WaitForMultipleObjects((DWORD)n, tv, TRUE, INFINITE);    \
    for (size_t i = 0; i < size_t(n); ++i) CloseHandle(tv[i]); \
  }
#endif
#else
// POSIX flavor of the worker-thread glue. The struct additionally carries
// the launching loop index as `tid` (DECLEAR_THREAD_DATA injects `i`).
#define DEFINE_THREAD_DATA(X) \
  template <class Float>      \
  struct X##_STRUCT {         \
    int tid;
#define DECLEAR_THREAD_DATA(X, ...)           \
  X##_STRUCT<Float> tdata = {i, __VA_ARGS__}; \
  X##_STRUCT<Float>* newdata = new X##_STRUCT<Float>(tdata)
#define BEGIN_THREAD_PROC(X) \
  }                          \
  ;                          \
  template <class Float>     \
  void* X##_PROC(X##_STRUCT<Float>* q) {
// cpu_set_t mask; CPU_ZERO( &mask );
// CPU_SET( q->tid, &mask );
// if( sched_setaffinity(0, sizeof(mask), &mask
// ) == -1 )
// std::cout <<"WARNING: Could not set CPU
// Affinity, continuing...\n";
// END_THREAD_RPOC also emits an X##_FUNCTOR helper so pthread_create can be
// handed the address of the correct template instantiation.
#define END_THREAD_RPOC(X)                                \
  delete q;                                               \
  return 0;                                               \
  }                                                       \
  template <class Float>                                  \
  struct X##_FUNCTOR {                                    \
    typedef void* (*func_type)(X##_STRUCT<Float>*);       \
    static func_type get() { return &(X##_PROC<Float>); } \
  };
#define MYTHREAD pthread_t

#define RUN_THREAD(X, t, ...)          \
  DECLEAR_THREAD_DATA(X, __VA_ARGS__); \
  pthread_create(&t, NULL, (void* (*)(void*))X##_FUNCTOR<Float>::get(), newdata)
#define WAIT_THREAD(tv, n)                                            \
  {                                                                   \
    for (size_t i = 0; i < size_t(n); ++i) pthread_join(tv[i], NULL); \
  }
#endif
template <class Float>
inline void MemoryCopyB(const Float* p, const Float* pe, Float* d) {
  // Forward element-wise copy of [p, pe) into d.
  for (; p < pe; ++p, ++d) *d = *p;
}
944 
template <class Float>
inline Float DotProduct8(const Float* v1, const Float* v2) {
  // 8-element dot product, accumulated left to right (same order as the
  // fully unrolled expression it replaces).
  Float sum = 0;
  for (int i = 0; i < 8; ++i) sum += v1[i] * v2[i];
  return sum;
}
template <class Float>
inline void ComputeTwoJX(const Float* jc, const Float* jp, const Float* xc,
                         const Float* xp, Float* jx) {
  // jx = J * x for one projection: two 8-wide camera rows (jc, jc + 8)
  // against xc, plus two 3-wide point rows of jp against xp.
  Float cx = 0, cy = 0;
  for (int i = 0; i < 8; ++i) {
    cx += jc[i] * xc[i];
    cy += jc[i + 8] * xc[i];
  }
  jx[0] = cx + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
  jx[1] = cy + (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
}
957 template <class Float>
958 Float ComputeVectorMax(const avec<Float>& vec) {
959  Float v = 0;
960  const Float* it = vec.begin();
961  for (; it < vec.end(); ++it) {
962  Float vi = (Float)fabs(*it);
963  v = std::max(v, vi);
964  }
965  return v;
966 }
967 
968 template <class Float>
969 void ComputeSXYPZ(Float a, const avec<Float>& vec1, const avec<Float>& vec2,
970  const avec<Float>& vec3, avec<Float>& result) {
971  if (vec1.begin() != NULL) {
972  const Float *p1 = &vec1[0], *p2 = &vec2[0], *p3 = &vec3[0];
973  Float *p4 = &result[0], *pe = p4 + result.size();
974  ComputeSXYPZ(a, p1, p2, p3, p4, pe);
975 
976  } else {
977  // ComputeSAXPY<Float>(a, vec2, vec3, result, 0);
978  ComputeSAXPY<Float>(a, vec2.begin(), vec3.begin(), result.begin(),
979  result.end());
980  }
981 }
982 
984 Float a;
985 const Float *p1, *p2;
986 Float *p3, *pe;
988 ComputeSAXPY(q->a, q->p1, q->p2, q->p3, q->pe);
990 
991 template <class Float>
992 void ComputeSAXPY(Float a, const avec<Float>& vec1, const avec<Float>& vec2,
993  avec<Float>& result, int mt = 0) {
994  const bool auto_multi_thread = true;
995  if (auto_multi_thread && mt == 0) {
996  mt = AUTO_MT_NUM(result.size() * 2);
997  }
998  if (mt > 1 && result.size() >= mt * 4) {
999  MYTHREAD threads[THREAD_NUM_MAX];
1000  const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
1001  const Float *p1 = vec1.begin(), *p2 = vec2.begin();
1002  Float* p3 = result.begin();
1003  for (size_t i = 0; i < thread_num; ++i) {
1004  size_t first = (result.size() * i / thread_num + FLOAT_ALIGN - 1) /
1006  size_t last_ = (result.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) /
1008  size_t last = std::min(last_, result.size());
1009  RUN_THREAD(ComputeSAXPY, threads[i], a, p1 + first, p2 + first,
1010  p3 + first, p3 + last);
1011  }
1012  WAIT_THREAD(threads, thread_num);
1013  } else {
1014  ComputeSAXPY(a, vec1.begin(), vec2.begin(), result.begin(), result.end());
1015  }
1016 }
1017 
1019 const Float *p, *pe;
1020 double* sum;
1022 ComputeVectorNorm(q->p, q->pe, q->sum);
1024 
1025 template <class Float>
1026 double ComputeVectorNorm(const avec<Float>& vec, int mt) {
1027  const bool auto_multi_thread = true;
1028  if (auto_multi_thread && mt == 0) {
1029  mt = AUTO_MT_NUM(vec.size());
1030  }
1031  if (mt > 1 && vec.size() >= mt * 4) {
1032  MYTHREAD threads[THREAD_NUM_MAX];
1033  double sumv[THREAD_NUM_MAX];
1034  const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
1035  const Float* p = vec;
1036  for (size_t i = 0; i < thread_num; ++i) {
1037  size_t first = (vec.size() * i / thread_num + FLOAT_ALIGN - 1) /
1039  size_t last_ = (vec.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) /
1041  size_t last = std::min(last_, vec.size());
1042  RUN_THREAD(ComputeVectorNorm, threads[i], p + first, p + last, sumv + i);
1043  }
1044  WAIT_THREAD(threads, thread_num);
1045  double sum = 0;
1046  for (size_t i = 0; i < thread_num; ++i) sum += sumv[i];
1047  return sum;
1048  } else {
1049  double sum;
1050  ComputeVectorNorm(vec.begin(), vec.end(), &sum);
1051  return sum;
1052  }
1053 }
1054 
template <class Float>
void GetRodriguesRotation(const Float m[3][3], Float r[3]) {
  // Convert a 3x3 rotation matrix to a Rodrigues (axis * angle) vector.
  // http://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToAngle/index.htm
  const double kPi = 3.14159265358979323846;
  const double epsilon = 0.01;
  double cos_angle = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0;
  bool nearly_symmetric = fabs(m[0][1] - m[1][0]) < epsilon &&
                          fabs(m[1][2] - m[2][1]) < epsilon &&
                          fabs(m[0][2] - m[2][0]) < epsilon;
  if (!nearly_symmetric) {
    // Generic case: angle from the trace, axis from the skew part.
    double angle = acos(cos_angle);
    double scale = 0.5 * angle / sin(angle);
    r[0] = Float(scale * (m[2][1] - m[1][2]));
    r[1] = Float(scale * (m[0][2] - m[2][0]));
    r[2] = Float(scale * (m[1][0] - m[0][1]));
    return;
  }
  if (fabs(m[0][1] + m[1][0]) < 0.1 && fabs(m[1][2] + m[2][1]) < 0.1 &&
      fabs(m[0][2] + m[2][0]) < 0.1 && cos_angle > 0.9) {
    // Near identity: zero rotation.
    r[0] = 0;
    r[1] = 0;
    r[2] = 0;
    return;
  }
  // Symmetric with angle near pi: recover the axis from the diagonal,
  // picking the dominant component for numerical stability.
  const Float ha = Float(sqrt(0.5) * kPi);
  double xx = (m[0][0] + 1.0) / 2.0;
  double yy = (m[1][1] + 1.0) / 2.0;
  double zz = (m[2][2] + 1.0) / 2.0;
  double xy = (m[0][1] + m[1][0]) / 4.0;
  double xz = (m[0][2] + m[2][0]) / 4.0;
  double yz = (m[1][2] + m[2][1]) / 4.0;

  if ((xx > yy) && (xx > zz)) {
    if (xx < epsilon) {
      r[0] = 0;
      r[1] = r[2] = ha;
    } else {
      double t = sqrt(xx);
      r[0] = Float(t * kPi);
      r[1] = Float(xy / t * kPi);
      r[2] = Float(xz / t * kPi);
    }
  } else if (yy > zz) {
    if (yy < epsilon) {
      r[0] = r[2] = ha;
      r[1] = 0;
    } else {
      double t = sqrt(yy);
      r[0] = Float(xy / t * kPi);
      r[1] = Float(t * kPi);
      r[2] = Float(yz / t * kPi);
    }
  } else {
    if (zz < epsilon) {
      r[0] = r[1] = ha;
      r[2] = 0;
    } else {
      double t = sqrt(zz);
      r[0] = Float(xz / t * kPi);
      r[1] = Float(yz / t * kPi);
      r[2] = Float(t * kPi);
    }
  }
}
template <class Float>
void UncompressRodriguesRotation(const Float r[3], Float m[]) {
  // Rodrigues axis-angle vector -> row-major 3x3 rotation matrix, using the
  // angle-stable coefficients ct = (1 - cos a) / a^2 and st = sin(a) / a,
  // with their a -> 0 limits (0.5 and 1) handled explicitly.
  double angle = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]);
  double ct = angle == 0.0 ? 0.5f : (1.0f - cos(angle)) / angle / angle;
  double st = angle == 0.0 ? 1 : sin(angle) / angle;
  m[0] = Float(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct);
  m[1] = Float(r[0] * r[1] * ct - r[2] * st);
  m[2] = Float(r[2] * r[0] * ct + r[1] * st);
  m[3] = Float(r[0] * r[1] * ct + r[2] * st);
  m[4] = Float(1.0f - (r[2] * r[2] + r[0] * r[0]) * ct);
  m[5] = Float(r[1] * r[2] * ct - r[0] * st);
  m[6] = Float(r[2] * r[0] * ct - r[1] * st);
  m[7] = Float(r[1] * r[2] * ct + r[0] * st);
  m[8] = Float(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct);
}
1131 template <class Float>
1132 void UpdateCamera(int ncam, const avec<Float>& camera, const avec<Float>& delta,
1133  avec<Float>& new_camera) {
1134  const Float *c = &camera[0], *d = &delta[0];
1135  Float *nc = &new_camera[0], m[9];
1136  // f[1], t[3], r[3][3], d[1]
1137  for (int i = 0; i < ncam; ++i, c += 16, d += 8, nc += 16) {
1138  nc[0] = max(c[0] + d[0], ((Float)1e-10));
1139  nc[1] = c[1] + d[1];
1140  nc[2] = c[2] + d[2];
1141  nc[3] = c[3] + d[3];
1142  nc[13] = c[13] + d[7];
1143 
1145  UncompressRodriguesRotation(d + 4, m);
1146  nc[4] = m[0] * c[4 + 0] + m[1] * c[4 + 3] + m[2] * c[4 + 6];
1147  nc[5] = m[0] * c[4 + 1] + m[1] * c[4 + 4] + m[2] * c[4 + 7];
1148  nc[6] = m[0] * c[4 + 2] + m[1] * c[4 + 5] + m[2] * c[4 + 8];
1149  nc[7] = m[3] * c[4 + 0] + m[4] * c[4 + 3] + m[5] * c[4 + 6];
1150  nc[8] = m[3] * c[4 + 1] + m[4] * c[4 + 4] + m[5] * c[4 + 7];
1151  nc[9] = m[3] * c[4 + 2] + m[4] * c[4 + 5] + m[5] * c[4 + 8];
1152  nc[10] = m[6] * c[4 + 0] + m[7] * c[4 + 3] + m[8] * c[4 + 6];
1153  nc[11] = m[6] * c[4 + 1] + m[7] * c[4 + 4] + m[8] * c[4 + 7];
1154  nc[12] = m[6] * c[4 + 2] + m[7] * c[4 + 5] + m[8] * c[4 + 8];
1155 
1156  // Float temp[3];
1157  // GetRodriguesRotation((Float (*)[3]) (nc + 4), temp);
1158  // UncompressRodriguesRotation(temp, nc + 4);
1159  nc[14] = c[14];
1160  nc[15] = c[15];
1161  }
1162 }
1163 
1164 template <class Float>
1165 void UpdateCameraPoint(int ncam, const avec<Float>& camera,
1166  const avec<Float>& point, avec<Float>& delta,
1167  avec<Float>& new_camera, avec<Float>& new_point,
1168  int mode, int mt) {
1170  if (mode != 2) {
1171  UpdateCamera(ncam, camera, delta, new_camera);
1172  }
1174  if (mode != 1) {
1175  avec<Float> dp;
1176  dp.set(delta.begin() + 8 * ncam, point.size());
1177  ComputeSAXPY((Float)1.0, dp, point, new_point, mt);
1178  }
1179 }
1180 
1181 template <class Float>
1182 void ComputeProjection(size_t nproj, const Float* camera, const Float* point,
1183  const Float* ms, const int* jmap, Float* pj, int radial,
1184  int mt);
1185 
1187 size_t nproj;
1188 const Float *camera, *point, *ms;
1189 const int* jmap;
1190 Float* pj;
1191 int radial_distortion;
1193 ComputeProjection(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj,
1194  q->radial_distortion, 0);
1196 
template <class Float>
void ComputeProjection(size_t nproj, const Float* camera, const Float* point,
                       const Float* ms, const int* jmap, Float* pj, int radial,
                       int mt) {
  // Reprojection residuals pj = measurement - projection for nproj
  // projections. jmap holds (camera index, point index) pairs; cameras are
  // 16-float records, points POINT_ALIGN floats. radial selects distortion:
  // 1 scales the projection by 1 + k*r^2, -1 scales the measurement
  // instead, 0 disables it. mt > 1 splits the list over worker threads.
  if (mt > 1 && nproj >= mt) {
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      // Each worker gets a contiguous slice [first, last) of projections.
      size_t first = nproj * i / thread_num;
      size_t last_ = nproj * (i + 1) / thread_num;
      size_t last = std::min(last_, nproj);
      RUN_THREAD(ComputeProjection, threads[i], last - first, camera, point,
                 ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial);
    }
    WAIT_THREAD(threads, thread_num);

  } else {
    for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) {
      const Float* c = camera + jmap[0] * 16;
      const Float* m = point + jmap[1] * POINT_ALIGN;
      // Rotate + translate into the camera frame (R rows c[4..12], t c[1..3]).
      Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1];
      Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2];
      Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3];

      if (radial == 1) {
        // Projection-space radial distortion with coefficient c[13].
        Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2);
        Float f_p2 = c[0] * rr / p2;
        pj[0] = ms[0] - p0 * f_p2;
        pj[1] = ms[1] - p1 * f_p2;
      } else if (radial == -1) {
        // Measurement-space distortion: distort the observation instead.
        Float f_p2 = c[0] / p2;
        Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]);
        pj[0] = ms[0] * rd - p0 * f_p2;
        pj[1] = ms[1] * rd - p1 * f_p2;
      } else {
        // Plain pinhole projection with focal length c[0].
        pj[0] = ms[0] - p0 * c[0] / p2;
        pj[1] = ms[1] - p1 * c[0] / p2;
      }
    }
  }
}
1239 
1240 template <class Float>
1241 void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point,
1242  const Float* ms, const int* jmap, Float* pj, int radial,
1243  int mt);
1244 
1246 size_t nproj;
1247 const Float *camera, *point, *ms;
1248 const int* jmap;
1249 Float* pj;
1250 int radial_distortion;
1252 ComputeProjectionX(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj,
1253  q->radial_distortion, 0);
1255 
template <class Float>
void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point,
                        const Float* ms, const int* jmap, Float* pj, int radial,
                        int mt) {
  // Variant of ComputeProjection with the distortion factor moved to the
  // other side of the residual: for radial == 1 the measurement is divided
  // by 1 + k*r^2, for radial == -1 the projection is divided by the
  // measurement-space factor. NOTE(review): the caller-side rationale for
  // this variant is not visible in this chunk; the algebra above is exactly
  // what the code computes.
  if (mt > 1 && nproj >= mt) {
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      // Contiguous per-thread slice of the projection list.
      size_t first = nproj * i / thread_num;
      size_t last_ = nproj * (i + 1) / thread_num;
      size_t last = std::min(last_, nproj);
      RUN_THREAD(ComputeProjectionX, threads[i], last - first, camera, point,
                 ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial);
    }
    WAIT_THREAD(threads, thread_num);
  } else {
    for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) {
      const Float* c = camera + jmap[0] * 16;
      const Float* m = point + jmap[1] * POINT_ALIGN;
      // Transform the point into the camera frame.
      Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1];
      Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2];
      Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3];
      if (radial == 1) {
        // Undistort the measurement rather than distorting the projection.
        Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2);
        Float f_p2 = c[0] / p2;
        pj[0] = ms[0] / rr - p0 * f_p2;
        pj[1] = ms[1] / rr - p1 * f_p2;
      } else if (radial == -1) {
        // Fold the measurement-space factor into the focal term.
        Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]);
        Float f_p2 = c[0] / p2 / rd;
        pj[0] = ms[0] - p0 * f_p2;
        pj[1] = ms[1] - p1 * f_p2;
      } else {
        pj[0] = ms[0] - p0 * c[0] / p2;
        pj[1] = ms[1] - p1 * c[0] / p2;
      }
    }
  }
}
1296 
template <class Float>
void ComputeProjectionQ(size_t nq, const Float* camera, const int* qmap,
                        const Float* wq, Float* pj) {
  // Residuals for the soft constraints linking pairs of cameras: weighted
  // (negated) differences of focal length (slot 0) and distortion (slot 13).
  for (size_t i = 0; i < nq; ++i) {
    const Float* cam_a = camera + qmap[2 * i] * 16;
    const Float* cam_b = camera + qmap[2 * i + 1] * 16;
    pj[2 * i] = -(cam_a[0] - cam_b[0]) * wq[2 * i];
    pj[2 * i + 1] = -(cam_a[13] - cam_b[13]) * wq[2 * i + 1];
  }
}
1307 
template <class Float>
void ComputeJQX(size_t nq, const Float* x, const int* qmap, const Float* wq,
                const Float* sj, Float* jx) {
  // J_q * x for the camera-pair constraints: weighted difference of the
  // focal (slot 0) and distortion (slot 7) entries of the two cameras'
  // 8-vectors. sj, when non-NULL, applies the Jacobian scaling first.
  if (sj) {
    for (size_t i = 0; i < nq; ++i) {
      const int ia = qmap[2 * i] * 8, ib = qmap[2 * i + 1] * 8;
      jx[2 * i] = (x[ia] * sj[ia] - x[ib] * sj[ib]) * wq[2 * i];
      jx[2 * i + 1] =
          (x[ia + 7] * sj[ia + 7] - x[ib + 7] * sj[ib + 7]) * wq[2 * i + 1];
    }
  } else {
    for (size_t i = 0; i < nq; ++i) {
      const int ia = qmap[2 * i] * 8, ib = qmap[2 * i + 1] * 8;
      jx[2 * i] = (x[ia] - x[ib]) * wq[2 * i];
      jx[2 * i + 1] = (x[ia + 7] - x[ib + 7]) * wq[2 * i + 1];
    }
  }
}
1330 
template <class Float>
void ComputeJQtEC(size_t ncam, const Float* pe, const int* qlist,
                  const Float* wq, const Float* sj, Float* v) {
  // Accumulate J_q^T * e into the camera gradient v (8 entries per camera).
  // qlist gives, per camera, the indices of its two pair-constraint
  // residuals (-1 = no constraint); only slots 0 (focal) and 7 (distortion)
  // are updated. sj optionally applies the per-camera Jacobian scaling.
  if (sj) {
    for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8, sj += 8) {
      const int ip = qlist[0];
      if (ip == -1) continue;
      const int in = qlist[1];
      const Float* ep = pe + ip * 2;
      const Float* en = pe + in * 2;
      v[0] += wq[0] * sj[0] * (ep[0] - en[0]);
      v[7] += wq[1] * sj[7] * (ep[1] - en[1]);
    }
  } else {
    for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8) {
      const int ip = qlist[0];
      if (ip == -1) continue;
      const int in = qlist[1];
      const Float* ep = pe + ip * 2;
      const Float* en = pe + in * 2;
      v[0] += wq[0] * (ep[0] - en[0]);
      v[7] += wq[1] * (ep[1] - en[1]);
    }
  }
}
1356 
template <class Float>
inline void JacobianOne(const Float* c, const Float* pt, const Float* ms,
                        Float* jxc, Float* jyc, Float* jxp, Float* jyp,
                        bool intrinsic_fixed, int radial_distortion) {
  // Jacobian of one reprojection. Camera part goes to jxc/jyc (8 entries
  // each: f, t[3], r[3], k); point part to jxp/jyp (3 entries each). Either
  // output pair may be NULL to skip it. radial_distortion matches the
  // models in ComputeProjection (1: projection-space, -1: measurement-space,
  // 0: none). c[15] != 0 marks a constant camera whose rows are zeroed.
  const Float* r = c + 4;  // rotation matrix rows
  Float x0 = c[4] * pt[0] + c[5] * pt[1] + c[6] * pt[2];
  Float y0 = c[7] * pt[0] + c[8] * pt[1] + c[9] * pt[2];
  Float z0 = c[10] * pt[0] + c[11] * pt[1] + c[12] * pt[2];
  Float p2 = (z0 + c[3]);
  Float f_p2 = c[0] / p2;
  Float p0_p2 = (x0 + c[1]) / p2;
  Float p1_p2 = (y0 + c[2]) / p2;

  if (radial_distortion == 1) {
    Float rr1 = c[13] * p0_p2 * p0_p2;
    Float rr2 = c[13] * p1_p2 * p1_p2;
    Float f_p2_x = Float(f_p2 * (1.0 + 3.0 * rr1 + rr2));
    Float f_p2_y = Float(f_p2 * (1.0 + 3.0 * rr2 + rr1));
    if (jxc) {
#ifndef PBA_DISABLE_CONST_CAMERA
      if (c[15] != 0.0f) {
        // Constant camera: both camera rows are identically zero.
        for (int k = 0; k < 8; ++k) jxc[k] = jyc[k] = 0;
      } else
#endif
      {
        Float jfc = intrinsic_fixed ? 0 : Float(1.0 + rr1 + rr2);
        Float ft_x_pn =
            intrinsic_fixed ? 0 : c[0] * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
        jxc[0] = p0_p2 * jfc;
        jxc[1] = f_p2_x;
        jxc[2] = 0;
        jxc[3] = -f_p2_x * p0_p2;
        jxc[4] = -f_p2_x * p0_p2 * y0;
        jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
        jxc[6] = -f_p2_x * y0;
        jxc[7] = ft_x_pn * p0_p2;

        jyc[0] = p1_p2 * jfc;
        jyc[1] = 0;
        jyc[2] = f_p2_y;
        jyc[3] = -f_p2_y * p1_p2;
        jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
        jyc[5] = f_p2_y * x0 * p1_p2;
        jyc[6] = f_p2_y * x0;
        jyc[7] = ft_x_pn * p1_p2;
      }
    }
    if (jxp) {
      jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
      jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
      jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
      jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
      jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
      jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
#ifdef POINT_DATA_ALIGN4
      jxp[3] = jyp[3] = 0;
#endif
    }
  } else {
    if (jxc) {
#ifndef PBA_DISABLE_CONST_CAMERA
      if (c[15] != 0.0f) {
        for (int k = 0; k < 8; ++k) jxc[k] = jyc[k] = 0;
      } else
#endif
      {
        jxc[0] = intrinsic_fixed ? 0 : p0_p2;
        jxc[1] = f_p2;
        jxc[2] = 0;
        jxc[3] = -f_p2 * p0_p2;
        jxc[4] = -f_p2 * p0_p2 * y0;
        jxc[5] = f_p2 * (z0 + x0 * p0_p2);
        jxc[6] = -f_p2 * y0;

        jyc[0] = intrinsic_fixed ? 0 : p1_p2;
        jyc[1] = 0;
        jyc[2] = f_p2;
        jyc[3] = -f_p2 * p1_p2;
        jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
        jyc[5] = f_p2 * x0 * p1_p2;
        jyc[6] = f_p2 * x0;

        // Distortion column only exists for the measurement-space model.
        if (radial_distortion == -1 && !intrinsic_fixed) {
          Float msn = ms[0] * ms[0] + ms[1] * ms[1];
          jxc[7] = -ms[0] * msn;
          jyc[7] = -ms[1] * msn;
        } else {
          jxc[7] = 0;
          jyc[7] = 0;
        }
      }
    }
    if (jxp) {
      jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
      jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
      jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
      jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
      jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
      jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
#ifdef POINT_DATA_ALIGN4
      jxp[3] = jyp[3] = 0;
#endif
    }
  }
}
1496 
1497 template <class Float>
1498 void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera,
1499  const Float* point, Float* jc, Float* jp, const int* jmap,
1500  const Float* sj, const Float* ms, const int* cmlist,
1501  bool intrinsic_fixed, int radial_distortion, bool shuffle,
1502  Float* jct, int mt = 2, int i0 = 0);
1503 
1505 size_t nproj, ncam;
1506 const Float *camera, *point;
1507 Float *jc, *jp;
1508 const int* jmap;
1509 const Float *sj, *ms;
1510 const int* cmlist;
1511 bool intrinsic_fixed;
1512 int radial_distortion;
1513 bool shuffle;
1514 Float* jct;
1515 int i0;
1517 ComputeJacobian(q->nproj, q->ncam, q->camera, q->point, q->jc, q->jp, q->jmap,
1518  q->sj, q->ms, q->cmlist, q->intrinsic_fixed,
1519  q->radial_distortion, q->shuffle, q->jct, 0, q->i0);
1521 
template <class Float>
void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera,
                     const Float* point, Float* jc, Float* jp, const int* jmap,
                     const Float* sj, const Float* ms, const int* cmlist,
                     bool intrinsic_fixed, int radial_distortion, bool shuffle,
                     Float* jct, int mt, int i0) {
  // Evaluate the sparse Jacobian for projections [i0, nproj). jc receives
  // the camera rows (16 floats per projection) and jp the point rows; either
  // may be NULL to skip that half. sj, when non-NULL, holds per-camera
  // scales (8 each) followed by per-point scales and is multiplied into the
  // rows. shuffle stores camera rows at cmlist[i] instead of i; jct, when
  // given with jc, receives an extra camera-ordered copy. mt > 1 splits the
  // projection list over worker threads.
  if (mt > 1 && nproj >= mt) {
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = nproj * i / thread_num;
      size_t last_ = nproj * (i + 1) / thread_num;
      size_t last = std::min(last_, nproj);
      // Workers receive shifted jmap/ms/cmlist and select their slice via
      // the (i0 = first, nproj = last) pair, writing into the shared jc/jp.
      RUN_THREAD(ComputeJacobian, threads[i], last, ncam, camera, point, jc, jp,
                 jmap + 2 * first, sj, ms + 2 * first, cmlist + first,
                 intrinsic_fixed, radial_distortion, shuffle, jct, first);
    }
    WAIT_THREAD(threads, thread_num);
  } else {
    const Float* sjc0 = sj;                         // camera scales
    const Float* sjp0 = sj ? sj + ncam * 8 : NULL;  // point scales

    for (size_t i = i0; i < nproj; ++i, jmap += 2, ms += 2, ++cmlist) {
      int cidx = jmap[0], pidx = jmap[1];
      const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;
      Float* jci = jc ? (jc + (shuffle ? cmlist[0] : i) * 16) : NULL;
      Float* jpi = jp ? (jp + i * POINT_ALIGN2) : NULL;

      // Both image-coordinate rows for this projection in one call.
      JacobianOne(c, pt, ms, jci, jci + 8, jpi, jpi + POINT_ALIGN,
                  intrinsic_fixed, radial_distortion);

      if (sjc0) {
        // jacobian scaling
        if (jci) {
          ScaleJ8(jci, jci + 8, sjc0 + cidx * 8);
        }
        if (jpi) {
          const Float* sjp = sjp0 + pidx * POINT_ALIGN;
          for (int j = 0; j < 3; ++j) {
            jpi[j] *= sjp[j];
            jpi[POINT_ALIGN + j] *= sjp[j];
          }
        }
      }

      // Keep a camera-major copy of the row pair for transposed products.
      if (jct && jc) MemoryCopyB(jci, jci + 16, jct + cmlist[0] * 16);
    }
  }
}
1573 
template <class Float>
void ComputeDiagonalAddQ(size_t ncam, const Float* qw, Float* d,
                         const Float* sj = NULL) {
  // Add the camera-pair constraint terms to the JtJ diagonal: each active
  // constraint (qw[0] != 0) contributes 2 * (w * s)^2 to slot 0 (focal) and
  // slot 7 (distortion) of the camera's 8-entry diagonal block — the factor
  // 2 presumably accounts for the constraint touching the camera twice;
  // confirm against the constraint construction. sj optionally applies the
  // per-camera Jacobian scaling.
  for (size_t i = 0; i < ncam; ++i, qw += 2, d += 8) {
    if (qw[0] != 0) {
      Float jf = sj ? qw[0] * sj[0] : qw[0];
      Float jd = sj ? qw[1] * sj[7] : qw[1];
      d[0] += (jf * jf * 2.0f);
      d[7] += (jd * jd * 2.0f);
    }
    if (sj) sj += 8;
  }
}
1593 
template <class Float>
void ComputeDiagonal(const avec<Float>& jcv, const vector<int>& cmapv,
                     const avec<Float>& jpv, const vector<int>& pmapv,
                     const vector<int>& cmlistv, const Float* qw0,
                     avec<Float>& jtjdi, bool jc_transpose, int radial) {
  // Compute jtjdi = 1 / diag(J^T J), the Jacobi-preconditioner diagonal:
  // camera entries first (8 per camera, only 7 used when radial == 0), then
  // point entries (POINT_ALIGN per point). cmap/pmap hold per-camera/
  // per-point [begin, end) projection ranges; cmlist translates range
  // positions to jc rows when jc is not stored camera-major. qw0, when
  // non-NULL, adds the camera-pair constraint weights.
  // first camera part
  if (jcv.size() == 0 || jpv.size() == 0) return;  // not gonna happen

  size_t ncam = cmapv.size() - 1, npts = pmapv.size() - 1;
  const int vn = radial ? 8 : 7;  // distortion slot unused without radial
  SetVectorZero(jtjdi);

  const int* cmap = &cmapv[0];
  const int* pmap = &pmapv[0];
  const int* cmlist = &cmlistv[0];
  const Float* jc = &jcv[0];
  const Float* jp = &jpv[0];
  const Float* qw = qw0;
  Float* jji = &jtjdi[0];

  // Camera diagonal: sum squares of both Jacobian rows of each projection.
  for (size_t i = 0; i < ncam; ++i, jji += 8, ++cmap, qw += 2) {
    int idx1 = cmap[0], idx2 = cmap[1];
    for (int j = idx1; j < idx2; ++j) {
      int idx = jc_transpose ? j : cmlist[j];
      const Float* pj = jc + idx * 16;
      for (int k = 0; k < vn; ++k)
        jji[k] += (pj[k] * pj[k] + pj[k + 8] * pj[k + 8]);
    }
    if (qw0 && qw[0] > 0) {
      // Pair constraints contribute to focal (0) and distortion (7).
      jji[0] += (qw[0] * qw[0] * 2.0f);
      jji[7] += (qw[1] * qw[1] * 2.0f);
    }
  }

  // Point diagonal.
  for (size_t i = 0; i < npts; ++i, jji += POINT_ALIGN, ++pmap) {
    int idx1 = pmap[0], idx2 = pmap[1];
    const Float* pj = jp + idx1 * POINT_ALIGN2;
    for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2) {
      for (int k = 0; k < 3; ++k)
        jji[k] += (pj[k] * pj[k] + pj[k + POINT_ALIGN] * pj[k + POINT_ALIGN]);
    }
  }
  // Invert element-wise; exact zeros (unobserved parameters) stay zero.
  Float* it = jtjdi.begin();
  for (; it < jtjdi.end(); ++it) {
    *it = (*it == 0) ? 0 : Float(1.0 / (*it));
  }
}
1645 
template <class T, int n, int m>
void InvertSymmetricMatrix(T a[n][m], T ai[n][m]) {
  // Invert a symmetric positive-definite n x n matrix stored in rows of m
  // columns (m >= n). Steps: Cholesky-factor a = L * L^T with L overwriting
  // the lower triangle, invert L in place, then form ai = L^-T * L^-1.
  // The input `a` is destroyed. Non-positive pivots are skipped, so a
  // semi-definite input yields a pseudo-result rather than crashing.
  // --- Cholesky factorization ---
  for (int col = 0; col < n; ++col) {
    if (a[col][col] > 0) {
      a[col][col] = sqrt(a[col][col]);
      for (int row = col + 1; row < n; ++row)
        a[row][col] = a[row][col] / a[col][col];
      for (int j = col + 1; j < n; ++j)
        for (int k = j; k < n; ++k) a[k][j] -= a[k][col] * a[j][col];
    }
  }
  // --- invert the triangular factor L in place ---
  for (int i = 0; i < n; ++i) {
    if (a[i][i] != 0) a[i][i] = 1.0f / a[i][i];
  }
  for (int i = 1; i < n; ++i) {
    if (a[i][i] == 0) continue;
    for (int j = 0; j < i; ++j) {
      T sum = 0;
      for (int k = j; k < i; ++k) sum += (a[i][k] * a[k][j]);
      a[i][j] = -sum * a[i][i];
    }
  }
  // --- ai = inv(L)^T * inv(L), filled symmetrically ---
  for (int i = 0; i < n; ++i) {
    for (int j = i; j < n; ++j) {
      T acc = 0;
      for (int k = j; k < n; ++k) acc += a[k][i] * a[k][j];
      ai[i][j] = acc;
      ai[j][i] = acc;
    }
  }
}
template <class T, int n, int m>
void InvertSymmetricMatrix(T* a, T* ai) {
  // Flat-buffer convenience overload: view both buffers as n x m row-major
  // matrices and delegate to the two-dimensional version.
  typedef T (*MatrixRows)[m];
  InvertSymmetricMatrix<T, n, m>(MatrixRows(a), MatrixRows(ai));
}
1684 
1685 template <class Float>
1686 void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2,
1687  const Float* jc, const int* cmap, const int* cmlist,
1688  Float* di, Float* bi, int vn, bool jc_transpose,
1689  bool use_jq, int mt);
1690 
1692 size_t ncam;
1693 float lambda1, lambda2;
1694 const Float* jc;
1695 const int *cmap, *cmlist;
1696 Float *di, *bi;
1697 int vn;
1698 bool jc_transpose, use_jq;
1700 ComputeDiagonalBlockC(q->ncam, q->lambda1, q->lambda2, q->jc, q->cmap,
1701  q->cmlist, q->di, q->bi, q->vn, q->jc_transpose,
1702  q->use_jq, 0);
1704 
template <class Float>
void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2,
                           const Float* jc, const int* cmap, const int* cmlist,
                           Float* di, Float* bi, int vn, bool jc_transpose,
                           bool use_jq, int mt) {
  // Build and invert the per-camera vn x vn diagonal blocks of J^T J. For
  // each camera: accumulate the block from its projection rows, record the
  // un-damped diagonal in di, apply LM damping d' = lambda2 * d + lambda1,
  // then write the block inverse to bi (vn*8 floats per camera). use_jq
  // means di already carries the pair-constraint terms, which are folded
  // into the block diagonal before damping.
  const size_t bc = vn * 8;

  if (mt > 1 && ncam >= (size_t)mt) {
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = ncam * i / thread_num;
      size_t last_ = ncam * (i + 1) / thread_num;
      size_t last = std::min(last_, ncam);
      RUN_THREAD(ComputeDiagonalBlockC, threads[i], (last - first), lambda1,
                 lambda2, jc, cmap + first, cmlist, di + 8 * first,
                 bi + bc * first, vn, jc_transpose, use_jq);
    }
    WAIT_THREAD(threads, thread_num);
  } else {
    // Aligned 8x8 scratch block for the accumulation.
    Float bufv[64 + 8];  // size_t offset = ((size_t)bufv) & 0xf;
    // Float* pbuf = bufv + ((16 - offset) / sizeof(Float));
    Float* pbuf = (Float*)ALIGN_PTR(bufv);

    for (size_t i = 0; i < ncam; ++i, ++cmap, bi += bc) {
      int idx1 = cmap[0], idx2 = cmap[1];
      if (idx1 == idx2) {
        // Camera with no projections: emit a zero (singular) block.
        SetVectorZero(bi, bi + bc);
      } else {
        SetVectorZero(pbuf, pbuf + 64);

        for (int j = idx1; j < idx2; ++j) {
          int idx = jc_transpose ? j : cmlist[j];
          const Float* pj = jc + idx * 16;
          // Accumulate both image-coordinate rows of this projection.
          AddBlockJtJ(pj, pbuf, vn);
          AddBlockJtJ(pj + 8, pbuf, vn);
        }

        // change and copy the diagonal

        if (use_jq) {
          // Fold the constraint terms from di into the block, then damp.
          Float* pb = pbuf;
          for (int j = 0; j < 8; ++j, ++di, pb += 9) {
            Float temp;
            di[0] = temp = (di[0] + pb[0]);
            pb[0] = lambda2 * temp + lambda1;
          }
        } else {
          // Save the raw diagonal into di and damp the block's copy.
          Float* pb = pbuf;
          for (int j = 0; j < 8; ++j, ++di, pb += 9) {
            *pb = lambda2 * ((*di) = (*pb)) + lambda1;
          }
        }

        // invert the matrix?
        if (vn == 8)
          InvertSymmetricMatrix<Float, 8, 8>(pbuf, bi);
        else
          InvertSymmetricMatrix<Float, 7, 8>(pbuf, bi);
      }
    }
  }
}
1771 
1772 template <class Float>
1773 void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2,
1774  const Float* jp, const int* pmap, Float* di,
1775  Float* bi, int mt);
1776 
1778 size_t npt;
1779 float lambda1, lambda2;
1780 const Float* jp;
1781 const int* pmap;
1782 Float *di, *bi;
1784 ComputeDiagonalBlockP(q->npt, q->lambda1, q->lambda2, q->jp, q->pmap, q->di,
1785  q->bi, 0);
1787 
template <class Float>
void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2,
                           const Float* jp, const int* pmap, Float* di,
                           Float* bi, int mt) {
  // Build and invert the per-point 3x3 diagonal blocks of J^T J. di records
  // the un-damped diagonal (POINT_ALIGN floats per point); bi gets the six
  // unique entries of the symmetric inverse, computed in closed form after
  // LM damping d' = lambda2 * d + lambda1.
  if (mt > 1) {
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = npt * i / thread_num;
      size_t last_ = npt * (i + 1) / thread_num;
      size_t last = std::min(last_, npt);
      RUN_THREAD(ComputeDiagonalBlockP, threads[i], (last - first), lambda1,
                 lambda2, jp, pmap + first, di + POINT_ALIGN * first,
                 bi + 6 * first);
    }
    WAIT_THREAD(threads, thread_num);
  } else {
    for (size_t i = 0; i < npt; ++i, ++pmap, di += POINT_ALIGN, bi += 6) {
      int idx1 = pmap[0], idx2 = pmap[1];

      // Accumulate the symmetric 3x3 block over this point's projections.
      Float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
      const Float *jxp = jp + idx1 * (POINT_ALIGN2), *jyp = jxp + POINT_ALIGN;
      for (int j = idx1; j < idx2;
           ++j, jxp += POINT_ALIGN2, jyp += POINT_ALIGN2) {
        M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
        M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
        M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
        M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
        M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
        M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
      }

      // Save the un-damped diagonal.
      di[0] = M00;
      di[1] = M11;
      di[2] = M22;

      // Levenberg-Marquardt damping.
      M00 = M00 * lambda2 + lambda1;
      M11 = M11 * lambda2 + lambda1;
      M22 = M22 * lambda2 + lambda1;

      // Closed-form 3x3 inverse via the adjugate; degenerate determinants
      // produce a zero block instead of dividing by ~0.
      Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 -
                  M02 * M02 * M11 - M12 * M12 * M00;
      if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
        // SetVectorZero(bi, bi + 6);
        for (int j = 0; j < 6; ++j) bi[j] = 0;
      } else {
        bi[0] = (M11 * M22 - M12 * M12) / det;
        bi[1] = -(M01 * M22 - M12 * M02) / det;
        bi[2] = (M01 * M12 - M02 * M11) / det;
        bi[3] = (M00 * M22 - M02 * M02) / det;
        bi[4] = -(M00 * M12 - M01 * M02) / det;
        bi[5] = (M00 * M11 - M01 * M01) / det;
      }
    }
  }
}
1847 
template <class Float>
void ComputeDiagonalBlock(size_t ncam, size_t npts, float lambda, bool dampd,
                          const Float* jc, const int* cmap, const Float* jp,
                          const int* pmap, const int* cmlist, const Float* sj,
                          const Float* wq, Float* diag, Float* blocks,
                          int radial_distortion, bool jc_transpose, int mt1 = 2,
                          int mt2 = 2, int mode = 0) {
  // Driver for the block-diagonal preconditioner. mode selects which halves
  // to (re)build: 0 = cameras and points, 1 = cameras only, 2 = points only.
  // dampd chooses multiplicative damping (d * (1 + lambda)) over additive
  // damping (d + lambda); both are expressed as d * lambda2 + lambda1 below.
  const int vn = radial_distortion ? 8 : 7;
  const size_t bc = vn * 8;
  float lambda1 = dampd ? 0.0f : lambda;
  float lambda2 = dampd ? (1.0f + lambda) : 1.0f;

  if (mode == 0) {
    // Full rebuild: zero both halves, then fill camera and point blocks.
    const size_t bsz = bc * ncam + npts * 6;
    const size_t dsz = 8 * ncam + npts * POINT_ALIGN;
    bool use_jq = wq != NULL;
    SetVectorZero(blocks, blocks + bsz);
    SetVectorZero(diag, diag + dsz);

    if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj);
    ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag,
                          blocks, vn, jc_transpose, use_jq, mt1);
    ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag + 8 * ncam,
                          blocks + bc * ncam, mt2);
  } else if (mode == 1) {
    // Camera half only.
    const size_t bsz = bc * ncam;
    const size_t dsz = 8 * ncam;
    bool use_jq = wq != NULL;
    SetVectorZero(blocks, blocks + bsz);
    SetVectorZero(diag, diag + dsz);

    if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj);
    ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag,
                          blocks, vn, jc_transpose, use_jq, mt1);
  } else {
    // Point half only; skip past the camera storage first.
    blocks += bc * ncam;
    diag += 8 * ncam;
    const size_t bsz = npts * 6;
    const size_t dsz = npts * POINT_ALIGN;
    SetVectorZero(blocks, blocks + bsz);
    SetVectorZero(diag, diag + dsz);

    ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag, blocks, mt2);
  }
}
1899 
1900 template <class Float>
1901 void ComputeDiagonalBlock_(float lambda, bool dampd, const avec<Float>& camerav,
1902  const avec<Float>& pointv, const avec<Float>& meas,
1903  const vector<int>& jmapv, const avec<Float>& sjv,
1904  avec<Float>& qwv, avec<Float>& diag,
1905  avec<Float>& blocks, bool intrinsic_fixed,
1906  int radial_distortion, int mode = 0) {
1907  const int vn = radial_distortion ? 8 : 7;
1908  const size_t szbc = vn * 8;
1909  size_t ncam = camerav.size() / 16;
1910  size_t npts = pointv.size() / POINT_ALIGN;
1911  size_t sz_jcd = ncam * 8;
1912  size_t sz_jcb = ncam * szbc;
1913  avec<Float> blockpv(blocks.size());
1914  SetVectorZero(blockpv);
1915  SetVectorZero(diag);
1917  float lambda1 = dampd ? 0.0f : lambda;
1918  float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
1919 
1920  Float jbufv[24 + 8]; // size_t offset = ((size_t) jbufv) & 0xf;
1921  // Float* jxc = jbufv + ((16 - offset) / sizeof(Float));
1922  Float* jxc = (Float*)ALIGN_PTR(jbufv);
1923  Float *jyc = jxc + 8, *jxp = jxc + 16, *jyp = jxc + 20;
1924 
1926  const int* jmap = &jmapv[0];
1927  const Float* camera = &camerav[0];
1928  const Float* point = &pointv[0];
1929  const Float* ms = &meas[0];
1930  const Float* sjc0 = sjv.size() ? &sjv[0] : NULL;
1931  const Float* sjp0 = sjv.size() ? &sjv[sz_jcd] : NULL;
1933  Float *blockpc = &blockpv[0], *blockpp = &blockpv[sz_jcb];
1934  Float *bo = blockpc, *bi = &blocks[0], *di = &diag[0];
1935 
1937  // diagonal blocks
1938  for (size_t i = 0; i < jmapv.size(); i += 2, jmap += 2, ms += 2) {
1939  int cidx = jmap[0], pidx = jmap[1];
1940  const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;
1942  JacobianOne(c, pt, ms, jxc, jyc, jxp, jyp, intrinsic_fixed,
1943  radial_distortion);
1944 
1946  if (mode != 2) {
1947  if (sjc0) {
1948  const Float* sjc = sjc0 + cidx * 8;
1949  ScaleJ8(jxc, jyc, sjc);
1950  }
1952  Float* bc = blockpc + cidx * szbc;
1953  AddBlockJtJ(jxc, bc, vn);
1954  AddBlockJtJ(jyc, bc, vn);
1955  }
1956 
1957  if (mode != 1) {
1958  if (sjp0) {
1959  const Float* sjp = sjp0 + pidx * POINT_ALIGN;
1960  jxp[0] *= sjp[0];
1961  jxp[1] *= sjp[1];
1962  jxp[2] *= sjp[2];
1963  jyp[0] *= sjp[0];
1964  jyp[1] *= sjp[1];
1965  jyp[2] *= sjp[2];
1966  }
1967 
1969  Float* bp = blockpp + pidx * 6;
1970  bp[0] += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
1971  bp[1] += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
1972  bp[2] += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
1973  bp[3] += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
1974  bp[4] += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
1975  bp[5] += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
1976  }
1977  }
1978 
1980  if (mode != 2) {
1982  const Float* qw = qwv.begin();
1983  if (qw) {
1984  for (size_t i = 0; i < ncam; ++i, qw += 2) {
1985  if (qw[0] == 0) continue;
1986  Float* bc = blockpc + i * szbc;
1987  if (sjc0) {
1988  const Float* sjc = sjc0 + i * 8;
1989  Float j1 = sjc[0] * qw[0];
1990  Float j2 = sjc[7] * qw[1];
1991  bc[0] += (j1 * j1 * 2.0f);
1992  if (radial_distortion) bc[63] += (j2 * j2 * 2.0f);
1993  } else {
1994  const Float* sjc = sjc0 + i * 8;
1995  bc[0] += (qw[0] * qw[0] * 2.0f);
1996  if (radial_distortion) bc[63] += (qw[1] * qw[1] * 2.0f);
1997  }
1998  }
1999  }
2000 
2001  for (size_t i = 0; i < ncam; ++i, bo += szbc, bi += szbc, di += 8) {
2002  Float *bp = bo, *dip = di;
2003  for (int j = 0; j < vn; ++j, ++dip, bp += 9) {
2004  dip[0] = bp[0];
2005  bp[0] = lambda2 * bp[0] + lambda1;
2006  }
2007 
2008  // invert the matrix?
2009  if (radial_distortion)
2010  InvertSymmetricMatrix<Float, 8, 8>(bo, bi);
2011  else
2012  InvertSymmetricMatrix<Float, 7, 8>(bo, bi);
2013  }
2014  } else {
2015  bo += szbc * ncam;
2016  bi += szbc * ncam;
2017  di += 8 * ncam;
2018  }
2019 
2021  // inverting the point part
2022  if (mode != 1) {
2023  for (size_t i = 0; i < npts; ++i, bo += 6, bi += 6, di += POINT_ALIGN) {
2024  Float &M00 = bo[0], &M01 = bo[1], &M02 = bo[2];
2025  Float &M11 = bo[3], &M12 = bo[4], &M22 = bo[5];
2026  di[0] = M00;
2027  di[1] = M11;
2028  di[2] = M22;
2029 
2031  M00 = M00 * lambda2 + lambda1;
2032  M11 = M11 * lambda2 + lambda1;
2033  M22 = M22 * lambda2 + lambda1;
2034 
2036  Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 -
2037  M02 * M02 * M11 - M12 * M12 * M00;
2038  if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
2039  for (int j = 0; j < 6; ++j) bi[j] = 0;
2040  } else {
2041  bi[0] = (M11 * M22 - M12 * M12) / det;
2042  bi[1] = -(M01 * M22 - M12 * M02) / det;
2043  bi[2] = (M01 * M12 - M02 * M11) / det;
2044  bi[3] = (M00 * M22 - M02 * M02) / det;
2045  bi[4] = -(M00 * M12 - M01 * M02) / det;
2046  bi[5] = (M00 * M11 - M01 * M01) / det;
2047  }
2048  }
2049  }
2050 }
2051 
2052 template <class Float>
2053 void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x,
2054  Float* vx, int vn, int mt = 0);
2055 
2057 int ncam;
2058 const Float *bi, *x;
2059 Float* vx;
2060 int vn;
2062 MultiplyBlockConditionerC(q->ncam, q->bi, q->x, q->vx, q->vn, 0);
2064 
2065 template <class Float>
2066 void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x,
2067  Float* vx, int vn, int mt) {
2068  if (mt > 1 && ncam >= mt) {
2069  const size_t bc = vn * 8;
2070  MYTHREAD threads[THREAD_NUM_MAX];
2071  const int thread_num = std::min(mt, THREAD_NUM_MAX);
2072  for (int i = 0; i < thread_num; ++i) {
2073  int first = ncam * i / thread_num;
2074  int last_ = ncam * (i + 1) / thread_num;
2075  int last = std::min(last_, ncam);
2076  RUN_THREAD(MultiplyBlockConditionerC, threads[i], (last - first),
2077  bi + first * bc, x + 8 * first, vx + 8 * first, vn);
2078  }
2079  WAIT_THREAD(threads, thread_num);
2080  } else {
2081  for (int i = 0; i < ncam; ++i, x += 8, vx += 8) {
2082  Float* vxc = vx;
2083  for (int j = 0; j < vn; ++j, bi += 8, ++vxc) *vxc = DotProduct8(bi, x);
2084  }
2085  }
2086 }
2087 
2088 template <class Float>
2089 void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x,
2090  Float* vx, int mt = 0);
2091 
2093 int npoint;
2094 const Float *bi, *x;
2095 Float* vx;
2097 MultiplyBlockConditionerP(q->npoint, q->bi, q->x, q->vx, 0);
2099 
2100 template <class Float>
2101 void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x,
2102  Float* vx, int mt) {
2103  if (mt > 1 && npoint >= mt) {
2104  MYTHREAD threads[THREAD_NUM_MAX];
2105  const int thread_num = std::min(mt, THREAD_NUM_MAX);
2106  for (int i = 0; i < thread_num; ++i) {
2107  int first = npoint * i / thread_num;
2108  int last_ = npoint * (i + 1) / thread_num;
2109  int last = std::min(last_, npoint);
2110  RUN_THREAD(MultiplyBlockConditionerP, threads[i], (last - first),
2111  bi + first * 6, x + POINT_ALIGN * first,
2112  vx + POINT_ALIGN * first);
2113  }
2114  WAIT_THREAD(threads, thread_num);
2115  } else {
2116  for (int i = 0; i < npoint;
2117  ++i, bi += 6, x += POINT_ALIGN, vx += POINT_ALIGN) {
2118  vx[0] = (bi[0] * x[0] + bi[1] * x[1] + bi[2] * x[2]);
2119  vx[1] = (bi[1] * x[0] + bi[3] * x[1] + bi[4] * x[2]);
2120  vx[2] = (bi[2] * x[0] + bi[4] * x[1] + bi[5] * x[2]);
2121  }
2122  }
2123 }
2124 
2125 template <class Float>
2126 void MultiplyBlockConditioner(int ncam, int npoint, const Float* blocksv,
2127  const Float* vec, Float* resultv, int radial,
2128  int mode, int mt1, int mt2) {
2129  const int vn = radial ? 8 : 7;
2130  if (mode != 2)
2131  MultiplyBlockConditionerC(ncam, blocksv, vec, resultv, vn, mt1);
2132  if (mt2 == 0) mt2 = AUTO_MT_NUM(npoint * 24);
2133  if (mode != 1)
2134  MultiplyBlockConditionerP(npoint, blocksv + (vn * 8 * ncam), vec + ncam * 8,
2135  resultv + 8 * ncam, mt2);
2136 }
2137 
2138 template <class Float>
2139 void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc,
2140  const Float* jp, const int* jmap, Float* jx, int mode,
2141  int mt = 2);
2142 
2144 size_t nproj, ncam;
2145 const Float *xc, *jc, *jp;
2146 const int* jmap;
2147 Float* jx;
2148 int mode;
2150 ComputeJX(q->nproj, q->ncam, q->xc, q->jc, q->jp, q->jmap, q->jx, q->mode, 0);
2152 
template <class Float>
// Computes jx = J * x using the STORED Jacobians jc (camera, 16 per
// projection) and jp (point, POINT_ALIGN2 per projection). Two residual
// entries are written per projection. mode: 0 = camera + point parts,
// 1 = camera part only, 2 = point part only.
void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc,
               const Float* jp, const int* jmap, Float* jx, int mode, int mt) {
  if (mt > 1 && nproj >= mt) {
    // Split the projections across threads; each worker recurses with mt = 0.
    // x is passed whole because jmap holds global camera/point indices.
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = nproj * i / thread_num;
      size_t last_ = nproj * (i + 1) / thread_num;
      size_t last = std::min(last_, nproj);
      RUN_THREAD(ComputeJX, threads[i], (last - first), ncam, x,
                 jc + 16 * first, jp + POINT_ALIGN2 * first, jmap + first * 2,
                 jx + first * 2, mode);
    }
    WAIT_THREAD(threads, thread_num);
  } else if (mode == 0) {
    // Camera parameters occupy the first ncam * 8 entries of x; point
    // parameters follow.
    const Float *pxc = x, *pxp = pxc + ncam * 8;
    // clock_t tp = clock(); double s1 = 0, s2 = 0;
    for (size_t i = 0; i < nproj;
         ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) {
      ComputeTwoJX(jc, jp, pxc + jmap[0] * 8, pxp + jmap[1] * POINT_ALIGN, jx);
    }
  } else if (mode == 1) {
    // Camera part only: two dot products per projection.
    const Float* pxc = x;
    // clock_t tp = clock(); double s1 = 0, s2 = 0;
    for (size_t i = 0; i < nproj;
         ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) {
      const Float* xc = pxc + jmap[0] * 8;
      jx[0] = DotProduct8(jc, xc);
      jx[1] = DotProduct8(jc + 8, xc);
    }
  } else if (mode == 2) {
    // Point part only: jp holds the two 3-wide rows for this projection.
    const Float* pxp = x + ncam * 8;
    // clock_t tp = clock(); double s1 = 0, s2 = 0;
    for (size_t i = 0; i < nproj;
         ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) {
      const Float* xp = pxp + jmap[1] * POINT_ALIGN;
      jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
      jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
    }
  }
}
2195 
2196 template <class Float>
2197 void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx,
2198  const Float* camera, const Float* point, const Float* ms,
2199  const Float* sj, const int* jmap, bool intrinsic_fixed,
2200  int radial_distortion, int mode, int mt = 16);
2201 
2203 size_t nproj, ncam;
2204 const Float* x;
2205 Float* jx;
2206 const Float *camera, *point, *ms, *sj;
2207 const int* jmap;
2208 bool intrinsic_fixed;
2209 int radial_distortion;
2210 int mode;
2212 ComputeJX_(q->nproj, q->ncam, q->x, q->jx, q->camera, q->point, q->ms, q->sj,
2213  q->jmap, q->intrinsic_fixed, q->radial_distortion, q->mode, 0);
2215 
template <class Float>
// Computes jx = J * x WITHOUT stored Jacobians: each projection's Jacobian
// is recomputed from the camera/point parameters via JacobianOne, optionally
// rescaled by sj, then multiplied with x. mode: 0 = camera + point parts,
// 1 = camera only, 2 = point only.
void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx,
                const Float* camera, const Float* point, const Float* ms,
                const Float* sj, const int* jmap, bool intrinsic_fixed,
                int radial_distortion, int mode, int mt) {
  if (mt > 1 && nproj >= mt) {
    // Split projections across threads; workers recurse with mt = 0.
    MYTHREAD threads[THREAD_NUM_MAX];
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = nproj * i / thread_num;
      size_t last_ = nproj * (i + 1) / thread_num;
      size_t last = std::min(last_, nproj);
      RUN_THREAD(ComputeJX_, threads[i], (last - first), ncam, x,
                 jx + first * 2, camera, point, ms + 2 * first, sj,
                 jmap + first * 2, intrinsic_fixed, radial_distortion, mode);
    }
    WAIT_THREAD(threads, thread_num);
  } else if (mode == 0) {
    // Aligned scratch for one projection's camera + point Jacobian rows.
    Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf;
    // Float* jc = jcv + (16 - offset) / sizeof(Float), *jp = jc + 16;
    Float *jc = (Float *)ALIGN_PTR(jcv), *jp = jc + 16;
    // Optional scaling: camera part first in sj, then the point part.
    const Float* sjc = sj;
    const Float* sjp = sjc ? (sjc + ncam * 8) : NULL;
    const Float *xc0 = x, *xp0 = x + ncam * 8;

    for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
      const int cidx = jmap[0], pidx = jmap[1];
      const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;
      JacobianOne(c, pt, ms, jc, jc + 8, jp, jp + POINT_ALIGN, intrinsic_fixed,
                  radial_distortion);
      if (sjc) {
        // jacobian scaling
        ScaleJ8(jc, jc + 8, sjc + cidx * 8);
        const Float* sjpi = sjp + pidx * POINT_ALIGN;
        for (int j = 0; j < 3; ++j) {
          jp[j] *= sjpi[j];
          jp[POINT_ALIGN + j] *= sjpi[j];
        }
      }
      ComputeTwoJX(jc, jp, xc0 + cidx * 8, xp0 + pidx * POINT_ALIGN, jx);
    }
  } else if (mode == 1) {
    // Camera part only: skip the point-Jacobian computation entirely.
    Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf;
    // Float* jc = jcv + (16 - offset) / sizeof(Float);
    Float* jc = (Float*)ALIGN_PTR(jcv);

    const Float *sjc = sj, *xc0 = x;

    for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
      const int cidx = jmap[0], pidx = jmap[1];
      const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;
      JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL,
                  intrinsic_fixed, radial_distortion);
      if (sjc) ScaleJ8(jc, jc + 8, sjc + cidx * 8);
      const Float* xc = xc0 + cidx * 8;
      jx[0] = DotProduct8(jc, xc);
      jx[1] = DotProduct8(jc + 8, xc);
    }
  } else if (mode == 2) {
    // Point part only: two 3-wide Jacobian rows per projection.
    Float jp[8];

    const Float* sjp = sj ? (sj + ncam * 8) : NULL;
    const Float* xp0 = x + ncam * 8;

    for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) {
      const int cidx = jmap[0], pidx = jmap[1];
      const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;
      JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, jp, jp + POINT_ALIGN,
                  intrinsic_fixed, radial_distortion);

      const Float* xp = xp0 + pidx * POINT_ALIGN;
      if (sjp) {
        // Apply the point scaling inline instead of rescaling jp.
        const Float* s = sjp + pidx * POINT_ALIGN;
        jx[0] = (jp[0] * xp[0] * s[0] + jp[1] * xp[1] * s[1] +
                 jp[2] * xp[2] * s[2]);
        jx[1] = (jp[3] * xp[0] * s[0] + jp[4] * xp[1] * s[1] +
                 jp[5] * xp[2] * s[2]);
      } else {
        jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]);
        jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]);
      }
    }
  }
}
2310 
2311 template <class Float>
2312 void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap,
2313  const int* cmlist, Float* v, bool jc_transpose, int mt);
2314 
2316 size_t ncam;
2317 const Float *pe, *jc;
2318 const int *cmap, *cmlist;
2319 Float* v;
2320 bool jc_transpose;
2322 ComputeJtEC(q->ncam, q->pe, q->jc, q->cmap, q->cmlist, q->v, q->jc_transpose,
2323  0);
2325 
2326 template <class Float>
2327 void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap,
2328  const int* cmlist, Float* v, bool jc_transpose, int mt) {
2329  if (mt > 1 && ncam >= mt) {
2330  MYTHREAD threads[THREAD_NUM_MAX]; // if(ncam < mt) mt = ncam;
2331  const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
2332  for (size_t i = 0; i < thread_num; ++i) {
2333  size_t first = ncam * i / thread_num;
2334  size_t last_ = ncam * (i + 1) / thread_num;
2335  size_t last = std::min(last_, ncam);
2336  RUN_THREAD(ComputeJtEC, threads[i], (last - first), pe, jc, cmap + first,
2337  cmlist, v + 8 * first, jc_transpose);
2338  }
2339  WAIT_THREAD(threads, thread_num);
2340  } else {
2342  for (size_t i = 0; i < ncam; ++i, ++cmap, v += 8) {
2343  int idx1 = cmap[0], idx2 = cmap[1];
2344  for (int j = idx1; j < idx2; ++j) {
2345  int edx = cmlist[j];
2346  const Float* pj = jc + ((jc_transpose ? j : edx) * 16);
2347  const Float* e = pe + edx * 2;
2349  AddScaledVec8(e[0], pj, v);
2350  AddScaledVec8(e[1], pj + 8, v);
2351  }
2352  }
2353  }
2354 }
2355 
2356 template <class Float>
2357 void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap,
2358  Float* v, int mt);
2359 
2361 size_t npt;
2362 const Float *pe, *jp;
2363 const int* pmap;
2364 Float* v;
2366 ComputeJtEP(q->npt, q->pe, q->jp, q->pmap, q->v, 0);
2368 
2369 template <class Float>
2370 void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap,
2371  Float* v, int mt) {
2372  if (mt > 1 && npt >= mt) {
2373  MYTHREAD threads[THREAD_NUM_MAX];
2374  const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
2375  for (size_t i = 0; i < thread_num; ++i) {
2376  size_t first = npt * i / thread_num;
2377  size_t last_ = npt * (i + 1) / thread_num;
2378  size_t last = std::min(last_, npt);
2379  RUN_THREAD(ComputeJtEP, threads[i], (last - first), pe, jp, pmap + first,
2380  v + POINT_ALIGN * first);
2381  }
2382  WAIT_THREAD(threads, thread_num);
2383  } else {
2384  for (size_t i = 0; i < npt; ++i, ++pmap, v += POINT_ALIGN) {
2385  int idx1 = pmap[0], idx2 = pmap[1];
2386  const Float* pj = jp + idx1 * POINT_ALIGN2;
2387  const Float* e = pe + idx1 * 2;
2388  Float temp[3] = {0, 0, 0};
2389  for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2, e += 2) {
2390  temp[0] += (e[0] * pj[0] + e[1] * pj[POINT_ALIGN]);
2391  temp[1] += (e[0] * pj[1] + e[1] * pj[POINT_ALIGN + 1]);
2392  temp[2] += (e[0] * pj[2] + e[1] * pj[POINT_ALIGN + 2]);
2393  }
2394  v[0] = temp[0];
2395  v[1] = temp[1];
2396  v[2] = temp[2];
2397  }
2398  }
2399 }
2400 
template <class Float>
void ComputeJtE(size_t ncam, size_t npt, const Float* pe, const Float* jc,
                const int* cmap, const int* cmlist, const Float* jp,
                const int* pmap, Float* v, bool jc_transpose, int mode, int mt1,
                int mt2) {
  // Computes v = J' * e from the stored Jacobians. mode: 0 = cameras and
  // points, 1 = cameras only, 2 = points only.
  const bool do_camera = (mode != 2);
  const bool do_point = (mode != 1);
  if (do_camera) {
    // The camera part accumulates into v, so clear it first.
    SetVectorZero(v, v + ncam * 8);
    ComputeJtEC(ncam, pe, jc, cmap, cmlist, v, jc_transpose, mt1);
  }
  if (do_point) ComputeJtEP(npt, pe, jp, pmap, v + 8 * ncam, mt2);
}
2414 
2415 template <class Float>
2416 void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c,
2417  const Float* point, const Float* ms, const int* jmap,
2418  const int* cmap, const int* cmlist, bool intrinsic_fixed,
2419  int radial_distortion, int mt);
2420 
2422 size_t ncam;
2423 const Float* ee;
2424 Float* jte;
2425 const Float *c, *point, *ms;
2426 const int *jmap, *cmap, *cmlist;
2427 bool intrinsic_fixed;
2428 int radial_distortion;
2430 ComputeJtEC_(q->ncam, q->ee, q->jte, q->c, q->point, q->ms, q->jmap, q->cmap,
2431  q->cmlist, q->intrinsic_fixed, q->radial_distortion, 0);
2433 
template <class Float>
// Accumulates jte += Jc' * e for the camera parameters WITHOUT a stored
// camera Jacobian: each projection's camera rows are recomputed on the fly.
// cmap gives each camera's [begin, end) range into cmlist; jmap maps a
// projection index to its (camera, point) pair.
void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c,
                  const Float* point, const Float* ms, const int* jmap,
                  const int* cmap, const int* cmlist, bool intrinsic_fixed,
                  int radial_distortion, int mt) {
  if (mt > 1 && ncam >= mt) {
    // Split the cameras across threads; workers recurse with mt = 0.
    MYTHREAD threads[THREAD_NUM_MAX];
    // if(ncam < mt) mt = ncam;
    const size_t thread_num = std::min(mt, THREAD_NUM_MAX);
    for (size_t i = 0; i < thread_num; ++i) {
      size_t first = ncam * i / thread_num;
      size_t last_ = ncam * (i + 1) / thread_num;
      size_t last = std::min(last_, ncam);
      RUN_THREAD(ComputeJtEC_, threads[i], (last - first), ee, jte + 8 * first,
                 c + first * 16, point, ms, jmap, cmap + first, cmlist,
                 intrinsic_fixed, radial_distortion);
    }
    WAIT_THREAD(threads, thread_num);

  } else {
    // Aligned scratch for the two camera-Jacobian rows of one projection.
    Float jcv[16 + 8]; // size_t offset = ((size_t) jcv) & 0xf;
    // Float* jcx = jcv + ((16 - offset) / sizeof(Float)), * jcy = jcx + 8;
    Float *jcx = (Float *)ALIGN_PTR(jcv), *jcy = jcx + 8;

    for (size_t i = 0; i < ncam; ++i, ++cmap, jte += 8, c += 16) {
      int idx1 = cmap[0], idx2 = cmap[1];

      for (int j = idx1; j < idx2; ++j) {
        int index = cmlist[j];
        // jmap[2*index + 1] is the point observed by this projection.
        const Float* pt = point + jmap[2 * index + 1] * POINT_ALIGN;
        const Float* e = ee + index * 2;

        // Only the camera rows are needed; point rows are skipped (NULL).
        JacobianOne(c, pt, ms + index * 2, jcx, jcy, (Float*)NULL, (Float*)NULL,
                    intrinsic_fixed, radial_distortion);

        AddScaledVec8(e[0], jcx, jte);
        AddScaledVec8(e[1], jcy, jte);
      }
    }
  }
}
2477 
template <class Float>
void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee,
                 Float* jte, const Float* camera, const Float* point,
                 const Float* ms, const int* jmap, const int* cmap,
                 const int* cmlist, const int* pmap, const Float* jp,
                 bool intrinsic_fixed, int radial_distortion, int mode,
                 int mt) {
  // Computes jte = J' * e with the camera part recomputed on the fly and the
  // point part taken from the stored point Jacobian jp.
  // mode: 0 = cameras and points, 1 = cameras only, 2 = points only.
  const bool do_camera = (mode != 2);
  const bool do_point = (mode != 1);
  if (do_camera) {
    // The camera part accumulates into jte, so clear it first.
    SetVectorZero(jte, jte + ncam * 8);
    ComputeJtEC_(ncam, ee, jte, camera, point, ms, jmap, cmap, cmlist,
                 intrinsic_fixed, radial_distortion, mt);
  }
  if (do_point) ComputeJtEP(npt, ee, jp, pmap, jte + 8 * ncam, mt);
}
2494 
template <class Float>
// Computes jte = J' * e with NO stored Jacobians at all: every projection's
// camera and/or point rows are recomputed from the parameters, then scaled
// by the residual and scattered into the matching camera/point slots.
// mode: 0 = cameras and points, 1 = cameras only, 2 = points only.
void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee,
                 Float* jte, const Float* camera, const Float* point,
                 const Float* ms, const int* jmap, bool intrinsic_fixed,
                 int radial_distortion, int mode) {
  // Clear the whole output (camera part + point part) before accumulating.
  SetVectorZero(jte, jte + (ncam * 8 + npt * POINT_ALIGN));
  // Aligned scratch: two 8-wide camera rows followed by two point rows.
  Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf;
  // Float* jc = jcv + (16 - offset) / sizeof(Float), *pj = jc + 16;
  Float *jc = (Float *)ALIGN_PTR(jcv), *pj = jc + 16;

  // Output slots: camera parameters first, point parameters after.
  Float *vc0 = jte, *vp0 = jte + ncam * 8;

  for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, ee += 2) {
    int cidx = jmap[0], pidx = jmap[1];
    const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN;

    if (mode == 0) {
      // Both camera and point rows are needed.
      JacobianOne(c, pt, ms, jc, jc + 8, pj, pj + POINT_ALIGN, intrinsic_fixed,
                  radial_distortion);

      Float *vc = vc0 + cidx * 8, *vp = vp0 + pidx * POINT_ALIGN;
      AddScaledVec8(ee[0], jc, vc);
      AddScaledVec8(ee[1], jc + 8, vc);
      vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]);
      vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]);
      vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]);
    } else if (mode == 1) {
      // Camera rows only; point rows skipped (NULL).
      JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL,
                  intrinsic_fixed, radial_distortion);

      Float* vc = vc0 + cidx * 8;
      AddScaledVec8(ee[0], jc, vc);
      AddScaledVec8(ee[1], jc + 8, vc);
    } else {
      // Point rows only; camera rows skipped (NULL).
      JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, pj, pj + POINT_ALIGN,
                  intrinsic_fixed, radial_distortion);

      Float* vp = vp0 + pidx * POINT_ALIGN;
      vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]);
      vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]);
      vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]);
    }
  }
}
2545 };
2546 
2547 using namespace ProgramCPU;
2548 
2549 template <class Float>
2551  : ParallelBA(PBA_INVALID_DEVICE),
2552  _num_camera(0),
2553  _num_point(0),
2554  _num_imgpt(0),
2555  _num_imgpt_q(0),
2556  _camera_data(NULL),
2557  _point_data(NULL),
2558  _imgpt_data(NULL),
2559  _camera_idx(NULL),
2560  _point_idx(NULL),
2561  _projection_sse(0) {
2562  __cpu_data_precision = sizeof(Float);
2563  if (num_threads <= 0) {
2564  __num_cpu_cores = FindProcessorCoreNum();
2565  } else {
2566  __num_cpu_cores = num_threads;
2567  }
2568  if (__verbose_level)
2569  std::cout << "CPU " << (__cpu_data_precision == 4 ? "single" : "double")
2570  << "-precision solver; " << __num_cpu_cores << " cores"
2571 #ifdef CPUPBA_USE_AVX
2572  << " (AVX)"
2573 #endif
2574  << ".\n";
  // The following configuration is based entirely on my personal experience
  // on two computers; you should adjust it according to your system.
2577  // try run driver filename -profile --float to see how speed varies
2590 
2592  __multiply_jx_usenoj = false;
2593 
  // To get the best performance, you should adjust the number of threads.
2596  // Linux and Windows may also have different thread launching overhead.
2597 
2602 
2605  1; // single thread always faster with my experience
2606 
  // see the AUTO_MT_NUM macro for definition
2608  __num_cpu_thread[FUNC_MPP] = 0; // automatically chosen according to size
2609  __num_cpu_thread[FUNC_VS] = 0; // automatically chosen according to size
2610  __num_cpu_thread[FUNC_VV] = 0; // automatically chosen accodring to size
2611 }
2612 
2613 template <class Float>
2615  if (sizeof(CameraT) != 16 * sizeof(float)) return; // never gonna happen...?
2616  _num_camera = (int)ncam;
2617  _camera_data = cams;
2618  _focal_mask = NULL;
2619 }
2620 
// Enables soft constraints tying together the focal lengths of cameras that
// share the same mask value; 'weight' sets the constraint strength.
// NOTE(review): fmask is stored, not copied — the caller must keep the array
// alive for the duration of the bundle adjustment.
template <class Float>
void SparseBundleCPU<Float>::SetFocalMask(const int* fmask, float weight) {
  _focal_mask = fmask;
  _weight_q = weight;
}
2626 
2627 template <class Float>
2629  _num_point = (int)npoint;
2630  _point_data = (float*)pts;
2631 }
2632 
// Registers the measurement list: nproj projections, where imgpts[i] is the
// observation of point point_idx[i] by camera cam_idx[i]. All pointers are
// stored, not copied — the arrays must outlive the optimization.
template <class Float>
void SparseBundleCPU<Float>::SetProjection(size_t nproj, const Point2D* imgpts,
                                           const int* point_idx,
                                           const int* cam_idx) {
  _num_imgpt = (int)nproj;
  _imgpt_data = (float*)imgpts;  // viewed as two floats per measurement
  _camera_idx = cam_idx;
  _point_idx = point_idx;
}
2642 
2643 template <class Float>
2645  return float(_projection_sse /
2646  (_num_imgpt * __focal_scaling * __focal_scaling));
2647 }
2648 
2649 template <class Float>
2651  ResetBundleStatistics();
2652  BundleAdjustment();
2653  if (__num_lm_success > 0)
2654  SaveBundleStatistics(_num_camera, _num_point, _num_imgpt);
2655  if (__num_lm_success > 0) PrintBundleStatistics();
2656  ResetTemporarySetting();
2657  return __num_lm_success;
2658 }
2659 
2660 template <class Float>
2662  if (_camera_data == NULL) return STATUS_CAMERA_MISSING;
2663  if (_point_data == NULL) return STATUS_POINT_MISSING;
2664  if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING;
2665  if (_camera_idx == NULL || _point_idx == NULL)
2666  return STATUS_PROJECTION_MISSING;
2667  return STATUS_SUCCESS;
2668 }
2669 
2670 template <class Float>
2673  TimerBA timer(this, TIMER_GPU_ALLOCATION);
2674  InitializeStorageForSFM();
2675  InitializeStorageForCG();
2676 
2677  if (__debug_pba) DumpCooJacobian();
2678 
2679  return STATUS_SUCCESS;
2680 }
2681 
2682 template <class Float>
2684  return _num_camera * 8 + POINT_ALIGN * _num_point;
2685 }
2686 
2687 template <class Float>
2689  if (ValidateInputData() != STATUS_SUCCESS) return;
2690 
2692  TimerBA timer(this, TIMER_OVERALL);
2693 
2694  NormalizeData();
2695  if (InitializeBundle() != STATUS_SUCCESS) {
2696  // failed to allocate gpu storage
2697  } else if (__profile_pba) {
2698  // profiling some stuff
2699  RunProfileSteps();
2700  } else {
2701  // real optimization
2702  AdjustBundleAdjsutmentMode();
2703  NonlinearOptimizeLM();
2704  TransferDataToHost();
2705  }
2706  DenormalizeData();
2707 }
2708 
2709 template <class Float>
2711  TimerBA timer(this, TIMER_PREPROCESSING);
2712  NormalizeDataD();
2713  NormalizeDataF();
2714 }
2715 
2716 template <class Float>
2718  TimerBA timer(this, TIMER_GPU_DOWNLOAD);
2719  std::copy(_cuCameraData.begin(), _cuCameraData.end(), ((float*)_camera_data));
2720 #ifdef POINT_DATA_ALIGN4
2721  std::copy(_cuPointData.begin(), _cuPointData.end(), _point_data);
2722 #else
2723  for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) {
2724  _point_data[j++] = (float)_cuPointData[i++];
2725  _point_data[j++] = (float)_cuPointData[i++];
2726  _point_data[j++] = (float)_cuPointData[i++];
2727  }
2728 #endif
2729 }
2730 
2731 #define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \
2732  { \
2733  NAME.resize((num) * (channels)); \
2734  total_sz += NAME.size() * sizeof(Float); \
2735  }
2736 #define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \
2737  if (option) ALLOCATE_REQUIRED_DATA(NAME, num, channels) else { \
2738  NAME.resize(0); \
2739  }
2741 template <class Float>
2743  size_t total_sz = 0;
2745  ProcessIndexCameraQ(_cuCameraQMap, _cuCameraQList);
2746  total_sz += ((_cuCameraQMap.size() + _cuCameraQList.size()) * sizeof(int) /
2747  1024 / 1024);
2748 
2750  ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, POINT_ALIGN); // 4n
2751  ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m
2752  ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m
2753 
2755  ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m
2756  ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k
2757  ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n
2758  ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k
2759  ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k
2760  ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, POINT_ALIGN); // 4n
2761  ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k
2762  ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2);
2763  ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0),
2764  2);
2765 
2766  ALLOCATE_OPTIONAL_DATA(_cuJacobianPoint, _num_imgpt * 2, POINT_ALIGN,
2767  !__no_jacobian_store); // 8k
2768  ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8,
2769  !__no_jacobian_store && __jc_store_transpose); // 16k
2770  ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8,
2771  !__no_jacobian_store && __jc_store_original); // 16k
2772  ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1,
2773  __jc_store_transpose); // k
2774 
2776  BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
2778  vector<int>& cpi = _cuCameraMeasurementMap;
2779  cpi.resize(_num_camera + 1);
2780  vector<int>& cpidx = _cuCameraMeasurementList;
2781  cpidx.resize(_num_imgpt);
2782  vector<int> cpnum(_num_camera, 0);
2783  cpi[0] = 0;
2784  for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++;
2785  for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1];
2787  vector<int> cptidx = cpi;
2788  for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i;
2789 
2791  if (_cuCameraMeasurementListT.size()) {
2792  vector<int>& ridx = _cuCameraMeasurementListT;
2793  ridx.resize(_num_imgpt);
2794  for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i;
2795  }
2796 
2799  if (_num_imgpt_q > 0)
2800  ProcessWeightCameraQ(cpnum, _cuCameraQMap, _cuCameraQMapW.begin(),
2801  _cuCameraQListW.begin());
2802 
2804  std::copy((float*)_camera_data, ((float*)_camera_data) + _cuCameraData.size(),
2805  _cuCameraData.begin());
2806 
2807 #ifdef POINT_DATA_ALIGN4
2808  std::copy(_point_data, _point_data + _cuPointData.size(),
2809  _cuPointData.begin());
2810 #else
2811  for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) {
2812  _cuPointData[i++] = _point_data[j++];
2813  _cuPointData[i++] = _point_data[j++];
2814  _cuPointData[i++] = _point_data[j++];
2815  }
2816 #endif
2817 
2820  vector<int>& ppi = _cuPointMeasurementMap;
2821  ppi.resize(_num_point + 1);
2822  for (int i = 0, last_point = -1; i < _num_imgpt; ++i) {
2823  int pt = _point_idx[i];
2824  while (last_point < pt) ppi[++last_point] = i;
2825  }
2826  ppi[_num_point] = _num_imgpt;
2827 
2829  vector<int>& pmp = _cuProjectionMap;
2830  pmp.resize(_num_imgpt * 2);
2831  for (int i = 0; i < _num_imgpt; ++i) {
2832  int* imp = &pmp[i * 2];
2833  imp[0] = _camera_idx[i];
2834  imp[1] = _point_idx[i];
2835  }
2836  BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
2838 
2839  __memory_usage = total_sz;
2840  if (__verbose_level > 1)
2841  std::cout << "Memory for Motion/Structure/Jacobian:\t"
2842  << (total_sz / 1024 / 1024) << "MB\n";
2843 
2844  return true;
2845 }
2846 
// Build the index structures for the shared-intrinsics ("camera Q") soft
// constraints. _focal_mask[i] names the representative camera of camera i's
// group (mask[i] == i for the representative, -1 for unconstrained cameras).
// The cameras of each group are linked into a circular doubly-linked list in
// temp (stored as {prev, next} pairs per camera); one (camera, next-camera)
// pair is then emitted per linked camera into qmap, and each camera's two
// constraint slots are recorded in qlist. _num_imgpt_q ends up as the number
// of constraint pairs. Returns false (and clears _focal_mask) if the mask is
// inconsistent: a camera pointing at a later index, or at a non-representative.
// NOTE(review): the signature line was lost in extraction; presumably
// ProcessIndexCameraQ(vector<int>& qmap, vector<int>& qlist) -- confirm
// against SparseBundleCPU.h.
2847 template <class Float>
2849  vector<int>& qlist) {
2851  qlist.resize(0);
2852  qmap.resize(0);
2853  _num_imgpt_q = 0;
2854 
// Nothing to do without cameras, measurements, a mask, or a positive weight.
2855  if (_camera_idx == NULL) return true;
2856  if (_point_idx == NULL) return true;
2857  if (_focal_mask == NULL) return true;
2858  if (_num_camera == 0) return true;
2859  if (_weight_q <= 0) return true;
2860 
2862 
2863  int error = 0;
2864  vector<int> temp(_num_camera * 2, -1);
2865 
2866  for (int i = 0; i < _num_camera; ++i) {
2867  int iq = _focal_mask[i];
// A camera may only reference an earlier (or equal) representative index.
2868  if (iq > i) {
2869  error = 1;
2870  break;
2871  }
2872  if (iq < 0) continue;
2873  if (iq == i) continue;
2874  int ip = temp[2 * iq];
2875  // float ratio = _camera_data[i].f / _camera_data[iq].f;
2876  // if(ratio < 0.01 || ratio > 100)
2877  //{
2878  // std::cout << "Warning: constaraints on largely different camreas\n";
2879  // continue;
2880  //}else
2881  if (_focal_mask[iq] != iq) {
2882  error = 1;
2883  break;
2884  } else if (ip == -1) {
// First non-representative member: link i and its representative iq.
2885  temp[2 * iq] = i;
2886  temp[2 * iq + 1] = i;
2887  temp[2 * i] = iq;
2888  temp[2 * i + 1] = iq;
2889  } else {
2890  // maintain double-linked list
2891  temp[2 * i] = ip;
2892  temp[2 * i + 1] = iq;
2893  temp[2 * ip + 1] = i;
2894  temp[2 * iq] = i;
2895  }
2896  }
2897 
2898  if (error) {
2899  std::cout << "Error: incorrect constraints\n";
2900  _focal_mask = NULL;
2901  return false;
2902  }
2903 
// Emit one constraint pair per linked camera; qlist maps each camera to the
// pair indices where it appears first (slot 2i) and second (slot 2i+1).
2905  qlist.resize(_num_camera * 2, -1);
2906  for (int i = 0; i < _num_camera; ++i) {
2907  int inext = temp[2 * i + 1];
2908  if (inext == -1) continue;
2909  qlist[2 * i] = _num_imgpt_q;
2910  qlist[2 * inext + 1] = _num_imgpt_q;
2911  qmap.push_back(i);
2912  qmap.push_back(inext);
2913  _num_imgpt_q++;
2914  }
2915  return true;
2916 }
2917 
// Compute the weights for the shared-intrinsics constraints. First averages
// focal length and radial distortion over each constraint group (so all
// members start from the group mean), then fills per-pair weights:
// wi scales with sqrt(measurements-per-group-member) * _weight_q, and wr adds
// a radial-distortion term when __use_radial_distortion is set.
// NOTE(review): signature line lost in extraction; presumably
// ProcessWeightCameraQ(vector<int>& cpnum, vector<int>& qmap,
// Float* qmapw, Float* qlistw) -- qmapw/qlistw are output iterators into
// _cuCameraQMapW / _cuCameraQListW (see the caller around original line 2800).
2918 template <class Float>
2920  vector<int>& qmap,
2921  Float* qmapw, Float* qlistw) {
2922  // set average focal length and average radial distortion
2923  vector<Float> qpnum(_num_camera, 0), qcnum(_num_camera, 0);
2924  vector<Float> fs(_num_camera, 0), rs(_num_camera, 0);
2925 
// Accumulate per-group sums of focal length, radial distortion,
// measurement counts, and member counts (indexed by representative qi).
2926  for (int i = 0; i < _num_camera; ++i) {
2927  int qi = _focal_mask[i];
2928  if (qi == -1) continue;
2929  // float ratio = _camera_data[i].f / _camera_data[qi].f;
2930  // if(ratio < 0.01 || ratio > 100) continue;
2931  fs[qi] += _camera_data[i].f;
2932  rs[qi] += _camera_data[i].radial;
2933  qpnum[qi] += cpnum[i];
2934  qcnum[qi] += 1.0f;
2935  }
2936 
2937  // this seems not really matter..they will converge anyway
2938  for (int i = 0; i < _num_camera; ++i) {
2939  int qi = _focal_mask[i];
2940  if (qi == -1) continue;
2941  // float ratio = _camera_data[i].f / _camera_data[qi].f;
2942  // if(ratio < 0.01 || ratio > 100) continue;
2943  _camera_data[i].f = fs[qi] / qcnum[qi];
2944  _camera_data[i].radial = rs[qi] / qcnum[qi];
2945  }
2946 
2948  std::fill(qlistw, qlistw + _num_camera * 2, 0);
2949 
// Per constraint pair: wi for the focal term, wr for the radial term.
2950  for (int i = 0; i < _num_imgpt_q; ++i) {
2951  int cidx = qmap[i * 2], qi = _focal_mask[cidx];
2952  Float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q;
2953  Float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0);
2954  qmapw[i * 2] = wi;
2955  qmapw[i * 2 + 1] = wr;
2956  qlistw[cidx * 2] = wi;
2957  qlistw[cidx * 2 + 1] = wr;
2958  }
2959 }
2960 
// Allocate the working vectors for the preconditioned conjugate-gradient
// solver: the gradient (JtE), iterates (XK), diagonal (JJ), and the PCG
// auxiliaries (ZK/PK/RK), all of parameter length 8m+3n; plus the
// block-diagonal preconditioner (64 or 56 floats per camera + 6 per point)
// and J*x storage. Accumulates the byte total into __memory_usage.
// NOTE(review): signature line lost in extraction; presumably
// bool SparseBundleCPU<Float>::InitializeStorageForCG() -- confirm in header.
2962 template <class Float>
2964  size_t total_sz = 0;
2965  int plen = GetParameterLength(); // q = 8m + 3n
2966 
2968  ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1);
2969  ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1);
2970  ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1);
2971  ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1);
2972  ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1);
2973  ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1);
2974 
// Camera block is 8x8 = 64 floats with radial distortion, 8x7 = 56 without.
2976  unsigned int cblock_len = (__use_radial_distortion ? 64 : 56);
2977  ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 6 * _num_point,
2978  1); // 64m + 12n
2979  ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k
2980  ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize);
2981 
2983  __memory_usage += total_sz;
2984  if (__verbose_level > 1)
2985  std::cout << "Memory for Conjugate Gradient Solver:\t"
2986  << (total_sz / 1024 / 1024) << "MB\n";
2987  return true;
2988 }
2989 
// Fill _cuVectorSJ with the column-scaling factors used for jacobian
// normalization. In both branches _cuVectorSJ is temporarily swapped with an
// empty vector so that EvaluateJacobians() runs WITHOUT scaling; the scale is
// then derived from the unscaled jacobian: either sqrt(diag(JtJ)) directly
// (first branch), or the reciprocal sqrt of the diagonal produced by
// ComputeBlockPC (second branch, which leaves the diagonal in _cuVectorJJ and
// swaps it into _cuVectorSJ).
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::PrepareJacobianNormalization().
2991 template <class Float>
2993  if (!_cuVectorSJ.size()) return;
2994 
2995  if ((__jc_store_transpose || __jc_store_original) &&
2996  _cuJacobianPoint.size() && !__bundle_current_mode) {
2997  VectorF null;
// Swap out SJ so jacobians are evaluated unscaled, then restore it.
2998  null.swap(_cuVectorSJ);
2999  EvaluateJacobians();
3000  null.swap(_cuVectorSJ);
3001  ComputeDiagonal(_cuVectorSJ);
3002  ComputeSQRT(_cuVectorSJ);
3003  } else {
3004  VectorF null;
3005  null.swap(_cuVectorSJ);
3006  EvaluateJacobians();
// lambda = 0, damp-diagonal = true: side effect fills _cuVectorJJ.
3007  ComputeBlockPC(0, true);
3008  null.swap(_cuVectorSJ);
3009  _cuVectorJJ.swap(_cuVectorSJ);
3010  ComputeRSQRT(_cuVectorSJ);
3011  }
3012 }
3013 
// Evaluate the jacobian blocks (camera part and point part) for all
// measurements, in camera-major layout (_cuJacobianCamera), transposed layout
// (_cuJacobianCameraT), or both, depending on the __jc_store_* flags. Skipped
// entirely when jacobians are recomputed on the fly (__no_jacobian_store) or
// when a motion-only bundle keeps no stored camera jacobian.
// NOTE(review): the signature line and the callee name on original line 3027
// were lost in extraction; the first branch presumably calls the same
// ProgramCPU::ComputeJacobian seen in the else branch.
3014 template <class Float>
3016  if (__no_jacobian_store) return;
3017  if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original &&
3018  !__jc_store_transpose)
3019  return;
3020 
3021  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true);
3022 
3023  if (__jc_store_original || !__jc_store_transpose) {
// Pick the thread-count slot matching which layouts are produced.
3024  int fid = __jc_store_original
3025  ? (__jc_store_transpose ? FUNC_JJ_JCO_JCT_JP : FUNC_JJ_JCO_JP)
3026  : FUNC_JJ_JP;
3028  _num_imgpt, _num_camera, _cuCameraData.begin(), _cuPointData.begin(),
3029  _cuJacobianCamera.begin(), _cuJacobianPoint.begin(),
3030  &_cuProjectionMap.front(), _cuVectorSJ.begin(), _cuMeasurements.begin(),
3031  __jc_store_transpose ? &_cuCameraMeasurementListT.front() : NULL,
3032  __fixed_intrinsics, __use_radial_distortion, false,
3033  _cuJacobianCameraT.begin(), __num_cpu_thread[fid]);
3034  } else {
// Transpose-only storage path.
3035  ComputeJacobian(_num_imgpt, _num_camera, _cuCameraData.begin(),
3036  _cuPointData.begin(), _cuJacobianCameraT.begin(),
3037  _cuJacobianPoint.begin(), &_cuProjectionMap.front(),
3038  _cuVectorSJ.begin(), _cuMeasurements.begin(),
3039  &_cuCameraMeasurementListT.front(), __fixed_intrinsics,
3040  __use_radial_distortion, true, ((Float*)0),
3041  __num_cpu_thread[FUNC_JJ_JCT_JP]);
3042  }
3043  ++__num_jacobian_eval;
3044 }
3045 
// Compute JtE = J^T * E (gradient of the squared reprojection error).
// mode selects which parameter block is produced: 0 = both, 1 = camera part
// only, 2 = point part only (inferred from the ComputeVXY spans below:
// _num_camera*8 for mode 1, point span offset by _num_camera*8 for mode 2 --
// confirm against ProgramCPU). Three strategies: recompute jacobians on the
// fly, use the transposed stored jacobian, or use the original-layout one;
// the on-the-fly paths apply the _cuVectorSJ scaling afterwards via
// ComputeVXY. The trailing block adds the gradient of the camera-Q soft
// constraints for the camera part.
// NOTE(review): the signature line and several callee-name lines (original
// 3053, 3082, 3089) were lost in extraction; presumably
// void SparseBundleCPU<Float>::ComputeJtE(VectorF& E, VectorF& JtE, int mode).
3046 template <class Float>
3048  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true);
3049  if (mode == 0) mode = __bundle_current_mode;
3050 
3051  if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) {
3052  if (_cuJacobianPoint.size()) {
3054  _num_imgpt, _num_camera, _num_point, E.begin(), JtE.begin(),
3055  _cuCameraData.begin(), _cuPointData.begin(), _cuMeasurements.begin(),
3056  &_cuProjectionMap.front(), &_cuCameraMeasurementMap.front(),
3057  &_cuCameraMeasurementList.front(), &_cuPointMeasurementMap.front(),
3058  _cuJacobianPoint.begin(), __fixed_intrinsics, __use_radial_distortion,
3059  mode, __num_cpu_thread[FUNC_JTE_]);
3060 
// Apply jacobian scaling to the camera part (point part was scaled already).
3061  if (_cuVectorSJ.size() && mode != 2)
3062  ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8);
3063  } else {
3064  ProgramCPU::ComputeJtE_(_num_imgpt, _num_camera, _num_point, E.begin(),
3065  JtE.begin(), _cuCameraData.begin(),
3066  _cuPointData.begin(), _cuMeasurements.begin(),
3067  &_cuProjectionMap.front(), __fixed_intrinsics,
3068  __use_radial_distortion, mode);
3069 
3071  // if(_cuVectorSJ.size()) ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE);
// Scale only the part(s) that were produced for this mode.
3072  if (!_cuVectorSJ.size()) {
3073  } else if (mode == 2)
3074  ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * POINT_ALIGN,
3075  _num_camera * 8);
3076  else if (mode == 1)
3077  ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8);
3078  else
3079  ComputeVXY(JtE, _cuVectorSJ, JtE);
3080  }
3081  } else if (__jc_store_transpose) {
3083  _num_camera, _num_point, E.begin(), _cuJacobianCameraT.begin(),
3084  &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3085  _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(),
3086  true, mode, __num_cpu_thread[FUNC_JTEC_JCT],
3087  __num_cpu_thread[FUNC_JTEP]);
3088  } else {
3090  _num_camera, _num_point, E.begin(), _cuJacobianCamera.begin(),
3091  &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3092  _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(),
3093  false, mode, __num_cpu_thread[FUNC_JTEC_JCO],
3094  __num_cpu_thread[FUNC_JTEP]);
3095  }
3096 
// Camera-Q constraint residuals live past the 2*_num_imgpt image residuals.
3097  if (mode != 2 && _num_imgpt_q > 0) {
3098  ProgramCPU::ComputeJQtEC(_num_camera, E.begin() + 2 * _num_imgpt,
3099  &_cuCameraQList.front(), _cuCameraQListW.begin(),
3100  _cuVectorSJ.begin(), JtE.begin());
3101  }
3102 }
3103 
// Record per-LM-iteration statistics (iteration, residual, damping, gradient
// norms) via ConfigBA::SaveBundleRecord. The gradient max-norm and 2-norm are
// only actually computed when the corresponding options are enabled; otherwise
// zeros/g_inf are reported, since computing them on large data is expensive.
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::SaveBundleRecord(int iter, float res,
// float damping, float& g_norm, float& g_inf).
3104 template <class Float>
3106  float damping, float& g_norm,
3107  float& g_inf) {
3108  // do not really compute if parameter not specified...
3109  // for large dataset, it never converges..
3110  g_inf = __lm_check_gradient ? float(ComputeVectorMax(_cuVectorJtE)) : 0;
3111  g_norm =
3112  __save_gradient_norm ? float(ComputeVectorNorm(_cuVectorJtE)) : g_inf;
3113  ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf);
3114 }
3115 
// Evaluate the reprojection residuals for the given camera/point parameters
// into proj (plus the camera-Q constraint residuals after the first
// 2*_num_imgpt entries), and return the value of ComputeVectorNorm over the
// full residual vector. Increments the projection-evaluation counter.
// NOTE(review): signature line lost in extraction; presumably
// float SparseBundleCPU<Float>::EvaluateProjection(VectorF& cam,
// VectorF& point, VectorF& proj).
3116 template <class Float>
3118  VectorF& proj) {
3119  ++__num_projection_eval;
3120  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
3121  ComputeProjection(_num_imgpt, cam.begin(), point.begin(),
3122  _cuMeasurements.begin(), &_cuProjectionMap.front(),
3123  proj.begin(), __use_radial_distortion,
3124  __num_cpu_thread[FUNC_PJ]);
3125  if (_num_imgpt_q > 0)
3126  ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(),
3127  _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt);
3128  return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]);
3129 }
3130 
// Same as EvaluateProjection but uses the ComputeProjectionX variant of the
// residual kernel (the two functions are otherwise identical line for line);
// the camera-Q constraint residuals and the returned norm are unchanged.
// NOTE(review): signature line lost in extraction; presumably
// float SparseBundleCPU<Float>::EvaluateProjectionX(VectorF& cam,
// VectorF& point, VectorF& proj).
3131 template <class Float>
3133  VectorF& proj) {
3134  ++__num_projection_eval;
3135  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
3136  ComputeProjectionX(_num_imgpt, cam.begin(), point.begin(),
3137  _cuMeasurements.begin(), &_cuProjectionMap.front(),
3138  proj.begin(), __use_radial_distortion,
3139  __num_cpu_thread[FUNC_PJ]);
3140  if (_num_imgpt_q > 0)
3141  ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(),
3142  _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt);
3143  return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]);
3144 }
3145 
// Compute JX = J * X, either without stored jacobians (first branch,
// recomputing from camera/point data) or by multiplying the stored jacobian
// blocks. The tail adds the camera-Q constraint contribution past the
// 2*_num_imgpt image entries for camera-affecting modes (mode != 2).
// NOTE(review): the signature line and the callee name on original line 3151
// were lost in extraction; presumably
// void SparseBundleCPU<Float>::ComputeJX(VectorF& X, VectorF& JX, int mode),
// with the first branch calling a ComputeJX_ style kernel.
3146 template <class Float>
3148  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true);
3149  if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) ||
3150  !__jc_store_original) {
3152  _num_imgpt, _num_camera, X.begin(), JX.begin(), _cuCameraData.begin(),
3153  _cuPointData.begin(), _cuMeasurements.begin(), _cuVectorSJ.begin(),
3154  &_cuProjectionMap.front(), __fixed_intrinsics, __use_radial_distortion,
3155  mode, __num_cpu_thread[FUNC_JX_]);
3156  } else {
3157  ProgramCPU::ComputeJX(_num_imgpt, _num_camera, X.begin(),
3158  _cuJacobianCamera.begin(), _cuJacobianPoint.begin(),
3159  &_cuProjectionMap.front(), JX.begin(), mode,
3160  __num_cpu_thread[FUNC_JX]);
3161  }
3162 
3163  if (_num_imgpt_q > 0 && mode != 2) {
3164  ProgramCPU::ComputeJQX(_num_imgpt_q, X.begin(), &_cuCameraQMap.front(),
3165  _cuCameraQMapW.begin(), _cuVectorSJ.begin(),
3166  JX.begin() + 2 * _num_imgpt);
3167  }
3168 }
3169 
// Build the block-diagonal preconditioner (one 8x8/8x7 block per camera, one
// 3x3 block per point) from the jacobians, with LM damping lambda; dampd
// selects diagonal (vs identity) damping. Chooses the kernel variant by
// jacobian storage strategy, mirroring ComputeJtE. Also fills _cuVectorJJ
// with the JtJ diagonal as a side effect used by the CG solvers.
// NOTE(review): the callee-name lines (original 3176, 3181, 3190) were lost
// in extraction; the three calls are presumably the no-store, transposed, and
// original-layout variants of ComputeBlockPC in ProgramCPU.
3170 template <class Float>
3171 void SparseBundleCPU<Float>::ComputeBlockPC(float lambda, bool dampd) {
3172  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true);
3173 
3174  if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose &&
3175  __bundle_current_mode != 2)) {
3177  lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements,
3178  _cuProjectionMap, _cuVectorSJ, _cuCameraQListW, _cuVectorJJ, _cuBlockPC,
3179  __fixed_intrinsics, __use_radial_distortion, __bundle_current_mode);
3180  } else if (__jc_store_transpose) {
3182  _num_camera, _num_point, lambda, dampd, _cuJacobianCameraT.begin(),
3183  &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(),
3184  &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3185  _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(),
3186  _cuBlockPC.begin(), __use_radial_distortion, true,
3187  __num_cpu_thread[FUNC_BCC_JCT], __num_cpu_thread[FUNC_BCP],
3188  __bundle_current_mode);
3189  } else {
3191  _num_camera, _num_point, lambda, dampd, _cuJacobianCamera.begin(),
3192  &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(),
3193  &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(),
3194  _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(),
3195  _cuBlockPC.begin(), __use_radial_distortion, false,
3196  __num_cpu_thread[FUNC_BCC_JCO], __num_cpu_thread[FUNC_BCP],
3197  __bundle_current_mode);
3198  }
3199 }
3200 
// Apply the block-diagonal preconditioner: pv = M^-1 * v, where M is the
// set of per-camera and per-point blocks built by ComputeBlockPC. mode is
// forwarded to MultiplyBlockConditioner (camera part / point part selection,
// consistent with the mode convention used by ComputeJtE/ComputeJX).
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::ApplyBlockPC(VectorF& v, VectorF& pv, int mode).
3201 template <class Float>
3203  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true);
3204  MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC.begin(),
3205  v.begin(), pv.begin(), __use_radial_distortion, mode,
3206  __num_cpu_thread[FUNC_MPC],
3207  __num_cpu_thread[FUNC_MPP]);
3208 }
3209 
// Compute the diagonal of JtJ into JJ from the stored jacobians (transposed
// or original layout, matching the storage strategy). A no-op when jacobians
// are not stored at all.
// NOTE(review): the signature line and callee-name lines (original 3211,
// 3215, 3220) were lost in extraction; presumably
// void SparseBundleCPU<Float>::ComputeDiagonal(VectorF& JJ) calling the
// ProgramCPU diagonal kernel in both branches.
3210 template <class Float>
3212  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true);
3213  if (__no_jacobian_store) {
3214  } else if (__jc_store_transpose) {
3216  _cuJacobianCameraT, _cuCameraMeasurementMap, _cuJacobianPoint,
3217  _cuPointMeasurementMap, _cuCameraMeasurementList,
3218  _cuCameraQListW.begin(), JJ, true, __use_radial_distortion);
3219  } else if (__jc_store_original) {
3221  _cuJacobianCamera, _cuCameraMeasurementMap, _cuJacobianPoint,
3222  _cuPointMeasurementMap, _cuCameraMeasurementList,
3223  _cuCameraQListW.begin(), JJ, false, __use_radial_distortion);
3224  }
3225 }
3226 
// Focal-length normalization: scale all image measurements (and camera focal
// lengths) so the median focal length becomes __data_normalize_median, and
// sanitize radial-distortion values (reset to 0 when requested or when the
// stored distortion type disagrees with __use_radial_distortion; rescale by
// 4*f_median^2 for the __use_radial_distortion == -1 model). When focal
// normalization is disabled, measurements are copied through unscaled and
// only the distortion sanitization runs.
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::NormalizeDataF().
3227 template <class Float>
3229  int incompatible_radial_distortion = 0;
3230  _cuMeasurements.resize(_num_imgpt * 2);
3231  if (__focal_normalize) {
// __focal_scaling == 1.0f means normalization has not been applied yet.
3232  if (__focal_scaling == 1.0f) {
3233  //------------------------------------------------------------------
// nth_element gives the median focal length in O(n).
3235  vector<float> focals(_num_camera);
3236  for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f;
3237  std::nth_element(focals.begin(), focals.begin() + _num_camera / 2,
3238  focals.end());
3239  float median_focal_length = focals[_num_camera / 2];
3240  __focal_scaling = __data_normalize_median / median_focal_length;
3241  Float radial_factor = median_focal_length * median_focal_length * 4.0f;
3242 
3244 
3245  for (int i = 0; i < _num_imgpt * 2; ++i) {
3246  _cuMeasurements[i] = Float(_imgpt_data[i] * __focal_scaling);
3247  }
3248  for (int i = 0; i < _num_camera; ++i) {
3249  _camera_data[i].f *= __focal_scaling;
3250  if (!__use_radial_distortion) {
3251  } else if (__reset_initial_distortion) {
3252  _camera_data[i].radial = 0;
3253  } else if (_camera_data[i].distortion_type != __use_radial_distortion) {
3254  incompatible_radial_distortion++;
3255  _camera_data[i].radial = 0;
3256  } else if (__use_radial_distortion == -1) {
3257  _camera_data[i].radial *= radial_factor;
3258  }
3259  }
3260  if (__verbose_level > 2)
3261  std::cout << "Focal length normalized by " << __focal_scaling << '\n';
3262  __reset_initial_distortion = false;
3263  }
3264  } else {
3265  if (__use_radial_distortion) {
3266  for (int i = 0; i < _num_camera; ++i) {
3267  if (__reset_initial_distortion) {
3268  _camera_data[i].radial = 0;
3269  } else if (_camera_data[i].distortion_type != __use_radial_distortion) {
3270  _camera_data[i].radial = 0;
3271  incompatible_radial_distortion++;
3272  }
3273  }
3274  __reset_initial_distortion = false;
3275  }
// No scaling: copy the raw measurements into the working buffer.
3276  std::copy(_imgpt_data, _imgpt_data + _cuMeasurements.size(),
3277  _cuMeasurements.begin());
3278  }
3279 
3280  if (incompatible_radial_distortion) {
3281  std::cout << "WARNING: incompatible radial distortion input; reset to 0;\n";
3282  }
3283 }
3284 
// Depth normalization and degeneracy handling. Computes each measurement's
// camera-space depth oz, detects near-degenerate projections (|oz/t_z| below
// __depth_check_epsilon, and points projecting on the wrong image side), then
// (a) scales cameras and points so the median depth matches
// 1/__data_normalize_median, and (b) nudges cameras whose closest points are
// too near the principal plane forward along t_z to avoid degeneracy.
// Counts and reports points behind cameras and modified cameras.
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::NormalizeDataD().
3285 template <class Float>
// __depth_scaling == 1.0f means depth normalization has not run yet.
3287  if (__depth_scaling == 1.0f) {
3288  const float dist_bound = 1.0f;
3289  vector<float> oz(_num_imgpt);
// Per-camera closest positive / closest negative depth seen so far.
3290  vector<float> cpdist1(_num_camera, dist_bound);
3291  vector<float> cpdist2(_num_camera, -dist_bound);
3292  vector<int> camnpj(_num_camera, 0), cambpj(_num_camera, 0);
3293  int bad_point_count = 0;
3294  for (int i = 0; i < _num_imgpt; ++i) {
3295  int cmidx = _camera_idx[i];
3296  CameraT* cam = _camera_data + cmidx;
3297  float* rz = cam->m[2];
3298  float* x = _point_data + 4 * _point_idx[i];
// Depth along the camera z axis: oz = r3 . X + t_z.
3299  oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]);
3300 
3302  // points behind camera may causes big problem
3303  float ozr = oz[i] / cam->t[2];
3304  if (fabs(ozr) < __depth_check_epsilon) {
3305  bad_point_count++;
3306  float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] +
3307  cam->m[0][2] * x[2] + cam->t[0]);
3308  float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] +
3309  cam->m[1][2] * x[2] + cam->t[1]);
3310  float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1];
// Compare signs on the dominant image axis to detect wrong-side projection.
3311  bool checkx = fabs(mx) > fabs(my);
3312  if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) ||
3313  (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) {
3314  if (__verbose_level > 3)
3315  std::cout << "Warning: proj of #" << cmidx
3316  << " on the wrong side, oz = " << oz[i] << " ("
3317  << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx
3318  << ',' << my << ")\n";
3320  if (oz[i] > 0)
3321  cpdist2[cmidx] = 0;
3322  else
3323  cpdist1[cmidx] = 0;
3324  }
3325  if (oz[i] >= 0)
3326  cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]);
3327  else
3328  cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]);
3329  }
3330  if (oz[i] < 0) {
3331  __num_point_behind++;
3332  cambpj[cmidx]++;
3333  }
3334  camnpj[cmidx]++;
3335  }
// Degenerate configurations force normalization on even if disabled.
3336  if (bad_point_count > 0 && __depth_degeneracy_fix) {
3337  if (!__focal_normalize || !__depth_normalize)
3338  std::cout << "Enable data normalization on degeneracy\n";
3339  __focal_normalize = true;
3340  __depth_normalize = true;
3341  }
3342  if (__depth_normalize) {
3343  std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end());
3344  float oz_median = oz[_num_imgpt / 2];
3345  float shift_min = std::min(oz_median * 0.001f, 1.0f);
3346  float dist_threshold = shift_min * 0.1f;
3347  __depth_scaling = (1.0 / oz_median) / __data_normalize_median;
3348  if (__verbose_level > 2)
3349  std::cout << "Depth normalized by " << __depth_scaling << " ("
3350  << oz_median << ")\n";
3351 
3352  for (int i = 0; i < _num_camera; ++i) {
3353  // move the camera a little bit?
3354  if (!__depth_degeneracy_fix) {
3355  } else if ((cpdist1[i] < dist_threshold ||
3356  cpdist2[i] > -dist_threshold)) {
3357  float shift_epsilon = fabs(_camera_data[i].t[2] * FLT_EPSILON);
3358  float shift = std::max(shift_min, shift_epsilon);
3359  bool boths =
3360  cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold;
3361  _camera_data[i].t[2] += shift;
3362  if (__verbose_level > 3)
3363  std::cout << "Adjust C" << std::setw(5) << i << " by "
3364  << std::setw(12) << shift << " [B" << std::setw(2)
3365  << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] ["
3366  << (boths ? 'X' : ' ') << "][" << cpdist1[i] << ", "
3367  << cpdist2[i] << "]\n";
3368  __num_camera_modified++;
3369  }
3370  _camera_data[i].t[0] *= __depth_scaling;
3371  _camera_data[i].t[1] *= __depth_scaling;
3372  _camera_data[i].t[2] *= __depth_scaling;
3373  }
3374  for (int i = 0; i < _num_point; ++i) {
3376  _point_data[4 * i + 0] *= __depth_scaling;
3377  _point_data[4 * i + 1] *= __depth_scaling;
3378  _point_data[4 * i + 2] *= __depth_scaling;
3379  }
3380  }
3381  if (__num_point_behind > 0)
3382  std::cout << "WARNING: " << __num_point_behind
3383  << " points are behind cameras.\n";
3384  if (__num_camera_modified > 0)
3385  std::cout << "WARNING: " << __num_camera_modified
3386  << " camera moved to avoid degeneracy.\n";
3387  }
3388 }
3389 
// Undo NormalizeDataF/NormalizeDataD: divide focal lengths and translations/
// point coordinates by the recorded scalings, rescale the radial term for the
// -1 distortion model, adjust the stored squared error accordingly, stamp the
// distortion type on every camera, and reset the scaling factors to 1.
// NOTE(review): signature line lost in extraction; presumably
// void SparseBundleCPU<Float>::DenormalizeData().
3390 template <class Float>
3392  if (__focal_normalize && __focal_scaling != 1.0f) {
3393  float squared_focal_factor = (__focal_scaling * __focal_scaling);
3394  for (int i = 0; i < _num_camera; ++i) {
3395  _camera_data[i].f /= __focal_scaling;
3396  if (__use_radial_distortion == -1)
3397  _camera_data[i].radial *= squared_focal_factor;
3398  _camera_data[i].distortion_type = __use_radial_distortion;
3399  }
// The stored squared error was accumulated in scaled units.
3400  _projection_sse /= squared_focal_factor;
3401  __focal_scaling = 1.0f;
3402  } else if (__use_radial_distortion) {
3403  for (int i = 0; i < _num_camera; ++i)
3404  _camera_data[i].distortion_type = __use_radial_distortion;
3405  }
3406 
3407  if (__depth_normalize && __depth_scaling != 1.0f) {
3408  for (int i = 0; i < _num_camera; ++i) {
3409  _camera_data[i].t[0] /= __depth_scaling;
3410  _camera_data[i].t[1] /= __depth_scaling;
3411  _camera_data[i].t[2] /= __depth_scaling;
3412  }
3413  for (int i = 0; i < _num_point; ++i) {
3414  _point_data[4 * i + 0] /= __depth_scaling;
3415  _point_data[4 * i + 1] /= __depth_scaling;
3416  _point_data[4 * i + 2] /= __depth_scaling;
3417  }
3418  __depth_scaling = 1.0f;
3419  }
3420 }
3421 
// Preconditioned conjugate gradient on the camera-reduced (Schur-complement)
// system: only the 8m camera parameters are iterated (the r/p/z/x/d views
// below span 8*_num_camera entries), with the point part folded in through
// the up/vp views and the block preconditioner. Returns the CG iteration
// count (0 on a non-finite first step, 1 on an exactly-zero first step);
// __recent_cg_status records how the loop terminated. The tail recovers the
// point-parameter update from the camera solution via back-substitution.
// NOTE(review): signature line lost in extraction; presumably
// int SparseBundleCPU<Float>::SolveNormalEquationPCGX(float lambda).
3422 template <class Float>
3424  //----------------------------------------------------------
3425  //(Jt * J + lambda * diag(Jt * J)) X = Jt * e
3426  //-------------------------------------------------------------
3427  TimerBA timer(this, TIMER_CG_ITERATION);
3428  __recent_cg_status = ' ';
3429 
3430  // diagonal for jacobian preconditioning...
3431  int plen = GetParameterLength();
3432  VectorF null;
3433  VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal
3434  ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3435 
3437 
3439  // B = [BC 0 ; 0 BP]
3440  // m = [mc 0; 0 mp];
3441  // A x= BC * x - JcT * Jp * mp * JpT * Jc * x
3442  // = JcT * Jc x + lambda * D * x + ........
3444 
// Camera-sized views (8 parameters per camera) over the shared CG buffers.
3445  VectorF r;
3446  r.set(_cuVectorRK.data(), 8 * _num_camera);
3447  VectorF p;
3448  p.set(_cuVectorPK.data(), 8 * _num_camera);
3449  VectorF z;
3450  z.set(_cuVectorZK.data(), 8 * _num_camera);
3451  VectorF x;
3452  x.set(_cuVectorXK.data(), 8 * _num_camera);
3453  VectorF d;
3454  d.set(VectorDP.data(), 8 * _num_camera);
3455 
// Point-sized views start after the camera block in the same buffers.
3456  VectorF& u = _cuVectorRK;
3457  VectorF& v = _cuVectorPK;
3458  VectorF up;
3459  up.set(u.data() + 8 * _num_camera, 3 * _num_point);
3460  VectorF vp;
3461  vp.set(v.data() + 8 * _num_camera, 3 * _num_point);
3462  VectorF uc;
3463  uc.set(z.data(), 8 * _num_camera);
3464 
3465  VectorF& e = _cuVectorJX;
3466  VectorF& e2 = _cuImageProj;
3467 
// Initial residual r of the reduced system, then first preconditioned step.
3468  ApplyBlockPC(_cuVectorJtE, u, 2);
3469  ComputeJX(u, e, 2);
3470  ComputeJtE(e, uc, 1);
3471  ComputeSAXPY(Float(-1.0f), uc, _cuVectorJtE, r); // r
3472  ApplyBlockPC(r, p, 1); // z = p = M r
3473 
3474  float_t rtz0 = (float_t)ComputeVectorDot(r, p); // r(0)' * z(0)
3475  ComputeJX(p, e, 1); // Jc * x
3476  ComputeJtE(e, u, 2); // JpT * jc * x
3477  ApplyBlockPC(u, v, 2);
3478  float_t qtq0 =
3479  (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0)
3480  float_t pdp0 = (float_t)ComputeVectorNormW(p, d); // p(0)' * DDD * p(0)
3481  float_t uv0 = (float_t)ComputeVectorDot(up, vp);
3482  float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0);
3483 
3484  if (__verbose_cg_iteration)
3485  std::cout << " --0,\t alpha = " << alpha0
3486  << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
3487  if (!std::isfinite(alpha0)) {
3488  return 0;
3489  }
3490  if (alpha0 == 0) {
3491  __recent_cg_status = 'I';
3492  return 1;
3493  }
3494 
3496  ComputeSAX((Float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k)
3497  ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p
3498  ComputeSAXPY(Float(-1.0f), e2, e, e, __num_cpu_thread[FUNC_VV]);
3499  ComputeJtE(e, uc, 1); // JcT * ....
3500  ComputeSXYPZ((Float)lambda, d, p, uc, uc);
3501  ComputeSAXPY((Float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk
3502 
3504  float_t rtzk = rtz0, rtz_min = rtz0, betak;
3505  int iteration = 1;
3506  ++__num_cg_iteration;
3507 
3508  while (true) {
3509  ApplyBlockPC(r, z, 1);
3510 
3512  float_t rtzp = rtzk;
3513  rtzk = (float_t)ComputeVectorDot(
3514  r, z); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
3515  float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));
// Converged: status digit records at which (early) iteration it happened.
3516  if (rtz_ratio < __cg_norm_threshold) {
3517  if (__recent_cg_status == ' ')
3518  __recent_cg_status = iteration < std::min(10, __cg_min_iteration)
3519  ? '0' + iteration
3520  : 'N';
3521  if (iteration >= __cg_min_iteration) break;
3522  }
3524  betak = rtzk / rtzp; // beta
3525  rtz_min = std::min(rtz_min, rtzk);
3526 
3527  ComputeSAXPY((Float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1)
3528  ComputeJX(p, e, 1); // Jc * p
3529  ComputeJtE(e, u, 2); // JpT * jc * p
3530  ApplyBlockPC(u, v, 2);
3532 
3533  float_t qtqk =
3534  (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(k)' q(k)
3535  float_t pdpk = (float_t)ComputeVectorNormW(p, d); // p(k)' * DDD * p(k)
3536  float_t uvk = (float_t)ComputeVectorDot(up, vp);
3537  float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk);
3538 
3540  if (__verbose_cg_iteration)
3541  std::cout << " --" << iteration << ",\t alpha= " << alphak
3542  << ", rtzk/rtz0 = " << rtz_ratio
3543  << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
3544 
3546  if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
3547  __recent_cg_status = 'X';
3548  break;
3549  } // something doesn't converge..
3550 
3552  ComputeSAXPY((Float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k)
3553 
3555  ++iteration;
3556  ++__num_cg_iteration;
3557  if (iteration >= std::min(__cg_max_iteration, plen)) break;
3558 
3559  ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p
3560  ComputeSAXPY((Float)-1.0f, e2, e, e, __num_cpu_thread[FUNC_VV]);
3561  ComputeJtE(e, uc, 1); // JcT * ....
3562  ComputeSXYPZ((Float)lambda, d, p, uc, uc);
3563  ComputeSAXPY((Float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk
3564  }
3565 
// Back-substitute: recover the point update from the camera solution.
3566  ComputeJX(x, e, 1);
3567  ComputeJtE(e, u, 2);
3568  VectorF jte_p;
3569  jte_p.set(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 3);
3570  ComputeSAXPY((Float)-1.0f, up, jte_p, vp);
3571  ApplyBlockPC(v, _cuVectorXK, 2);
3572  return iteration;
3573 }
3574 
// Preconditioned conjugate gradient on the FULL damped normal equations
// (Jt*J + lambda*D) x = Jt*e, iterating over all 8m+3n parameters (contrast
// with SolveNormalEquationPCGX, which iterates only the camera block).
// Returns the iteration count with the same 0/1 special cases and
// __recent_cg_status codes. Every __cg_recalculate_freq iterations the
// residual is recomputed from scratch instead of updated incrementally, to
// limit floating-point drift.
// NOTE(review): the signature line and parts of original lines 3598,
// 3656, 3658 were lost in extraction; presumably
// int SparseBundleCPU<Float>::SolveNormalEquationPCGB(float lambda), with
// qtq0/qtqk/ptdpk computed by the ComputeVectorNorm/NormW calls whose first
// lines are missing.
3575 template <class Float>
3577  //----------------------------------------------------------
3578  //(Jt * J + lambda * diag(Jt * J)) X = Jt * e
3579  //-------------------------------------------------------------
3580  TimerBA timer(this, TIMER_CG_ITERATION);
3581  __recent_cg_status = ' ';
3582 
3583  // diagonal for jacobian preconditioning...
3584  int plen = GetParameterLength();
3585  VectorF null;
3586  VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal
3587  VectorF& VectorQK = _cuVectorZK; // temporary
3588  ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3589 
3591  ApplyBlockPC(_cuVectorJtE,
3592  _cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e
3593  ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0)
3594 
3596  float_t rtz0 =
3597  (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK); // r(0)' * z(0)
3599  _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0)
3600  float_t ptdp0 =
3601  (float_t)ComputeVectorNormW(_cuVectorPK, VectorDP); // p(0)' * DDD * p(0)
3602  float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0);
3603 
3604  if (__verbose_cg_iteration)
3605  std::cout << " --0,\t alpha = " << alpha0
3606  << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
3607  if (!std::isfinite(alpha0)) {
3608  return 0;
3609  }
3610  if (alpha0 == 0) {
3611  __recent_cg_status = 'I';
3612  return 1;
3613  }
3614 
3616 
3617  ComputeSAX((Float)alpha0, _cuVectorPK,
3618  _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k)
3619  ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0)
3620 
3621  ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK,
3622  VectorQK); // Jt * J * p0 + lambda * DDD * p0
3623 
3624  ComputeSAXPY(
3625  (Float)-alpha0, VectorQK, _cuVectorJtE,
3626  _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ;
3627 
3628  float_t rtzk = rtz0, rtz_min = rtz0, betak;
3629  int iteration = 1;
3630  ++__num_cg_iteration;
3631 
3632  while (true) {
3633  ApplyBlockPC(_cuVectorRK, _cuVectorZK);
3634 
3636  float_t rtzp = rtzk;
3637  rtzk = (float_t)ComputeVectorDot(
3638  _cuVectorRK, _cuVectorZK); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
3639  float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));
// Converged: status digit records at which (early) iteration it happened.
3640  if (rtz_ratio < __cg_norm_threshold) {
3641  if (__recent_cg_status == ' ')
3642  __recent_cg_status = iteration < std::min(10, __cg_min_iteration)
3643  ? '0' + iteration
3644  : 'N';
3645  if (iteration >= __cg_min_iteration) break;
3646  }
3648  betak = rtzk / rtzp; // beta
3649  rtz_min = std::min(rtz_min, rtzk);
3650 
3651  ComputeSAXPY((Float)betak, _cuVectorPK, _cuVectorZK,
3652  _cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1)
3653  ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k)
3655 
3657  _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(k)' q(k)
3659  _cuVectorPK, VectorDP); // p(k)' * DDD * p(k)
3660 
3661  float_t alphak = rtzk / (qtqk + lambda * ptdpk);
3662 
3664  if (__verbose_cg_iteration)
3665  std::cout << " --" << iteration << ",\t alpha= " << alphak
3666  << ", rtzk/rtz0 = " << rtz_ratio
3667  << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
3668 
3670  if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
3671  __recent_cg_status = 'X';
3672  break;
3673  } // something doesn't converge..
3674 
3676  ComputeSAXPY((Float)alphak, _cuVectorPK, _cuVectorXK,
3677  _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k)
3678 
3680  ++iteration;
3681  ++__num_cg_iteration;
3682  if (iteration >= std::min(__cg_max_iteration, plen)) break;
3683 
// Periodically recompute the residual exactly to limit numeric drift.
3684  if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) {
3686  ComputeJX(_cuVectorXK, _cuVectorJX);
3687  ComputeJtE(_cuVectorJX, VectorQK);
3688  ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK);
3689  ComputeSAXPY((Float)-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK);
3690  } else {
3691  ComputeJtE(_cuVectorJX, VectorQK);
3692  ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK,
3693  VectorQK); //
3694  ComputeSAXPY(
3695  (Float)-alphak, VectorQK, _cuVectorRK,
3696  _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ;
3697  }
3698  }
3699  return iteration;
3700 }
3701 
// Solve the damped normal equations for the current bundle mode: motion-only
// and structure-only reduce to a single block-preconditioner application
// (mode 1 = camera part, mode 2 = point part); the full problem dispatches to
// the Schur-complement (PCGX) or full-system (PCGB) CG solver. Returns the
// iteration count used.
// NOTE(review): signature line lost in extraction; presumably
// int SparseBundleCPU<Float>::SolveNormalEquation(float lambda).
3702 template <class Float>
3704  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3705  ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3706  ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1);
3707  return 1;
3708  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3709  ComputeBlockPC(lambda, __lm_use_diagonal_damp);
3710  ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2);
3711  return 1;
3712  } else {
3714  return __cg_schur_complement ? SolveNormalEquationPCGX(lambda)
3715  : SolveNormalEquationPCGB(lambda);
3716  }
3717 }
3718 
// Debug dump: write the SPARSITY PATTERN of the jacobian (j.txt) and its
// transpose (jt.txt) in MatrixMarket coordinate format, with every nonzero
// written as value 1 (actual jacobian values are not emitted; note the
// commented-out jc/jp pointers). Each measurement contributes two rows (x and
// y residuals); cameras use cn = 7 or 8 columns depending on radial
// distortion, points use 3.
// NOTE(review): signature lines (original 3720-3721) lost in extraction;
// presumably void SparseBundleCPU<Float>::DumpCooJacobian().
3719 template <class Float>
3722  ofstream jo("j.txt");
3723  int cn = __use_radial_distortion ? 8 : 7;
3724  int width = cn * _num_camera + 3 * _num_point;
3725  jo << "%%MatrixMarket matrix coordinate real general\n";
3726  jo << (_num_imgpt * 2) << " " << width << " " << (cn + 3) * _num_imgpt * 2
3727  << '\n';
3728  for (int i = 0; i < _num_imgpt; ++i) {
3729  int ci = _camera_idx[i];
3730  int pi = _point_idx[i];
3731  int row = i * 2 + 1;
3732  // Float * jc = _cuJacobianCamera.data() + i * 16;
3733  // Float * jp = _cuJacobianPoint.data() + i * 6;
// Column bases: camera block, then point block after all camera columns.
3734  int idx1 = ci * cn;
3735  int idx2 = _num_camera * cn + 3 * pi;
3736 
3737  for (int k = 0; k < 2; ++k, ++row) {
3738  for (int j = 0; j < cn; ++j) {
3739  jo << row << " " << (idx1 + j + 1) << " 1\n";
3740  }
3741  for (int j = 0; j < 3; ++j) {
3742  jo << row << " " << (idx2 + j + 1) << " 1\n";
3743  }
3744  }
3745  }
3746 
3747  ofstream jt("jt.txt");
3748  jt << "%%MatrixMarket matrix coordinate real general\n";
3749  jt << width << " " << (_num_imgpt * 2) << " " << (cn + 3) * _num_imgpt * 2
3750  << '\n';
3751 
3752  int* lisc = &_cuCameraMeasurementList[0];
3753  int* mapc = &_cuCameraMeasurementMap[0];
3754  int* mapp = &_cuPointMeasurementMap[0];
3755 
// Transpose pattern: iterate cameras/points and list their measurements.
3756  for (int i = 0; i < _num_camera; ++i) {
3757  int c0 = mapc[i];
3758  int c1 = mapc[i + 1];
3759  for (int k = 0; k < cn; ++k) {
3760  int row = i * cn + k + 1;
3761  for (int j = c0; j < c1; ++j)
3762  jt << row << " " << (lisc[j] * 2 + 1) << " 1\n" << row << " "
3763  << (2 * lisc[j] + 2) << " 1\n";
3764  ;
3765  }
3766  }
3767  for (int i = 0; i < _num_point; ++i) {
3768  int p0 = mapp[i];
3769  int p1 = mapp[i + 1];
3770  for (int k = 0; k < 3; ++k) {
3771  int row = i * 3 + _num_camera * cn + k + 1;
3772  for (int j = p0; j < p1; ++j)
3773  jt << row << " " << (2 * j + 1) << " 1\n" << row << " " << (2 * j + 2)
3774  << " 1\n";
3775  ;
3776  }
3777  }
3778 }
3779 
3780 template <class Float>
// Profiling helper: run the work of one LM iteration (projection, Jacobians,
// gradient, one linear solve, update, plus the vector ops used by the gain
// ratio) without any convergence logic.  Index gives the signature as
// void RunTestIterationLM(bool reduced); the signature line itself (original
// 3781) was lost in this extraction -- confirm.
// 'reduced' selects the Schur-complement solver (PCGX) over the full-system
// solver (PCGB).
3782  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
3783  EvaluateJacobians();
3784  ComputeJtE(_cuImageProj, _cuVectorJtE);
3785  if (reduced)
3786  SolveNormalEquationPCGX(__lm_initial_damp);
3787  else
3788  SolveNormalEquationPCGB(__lm_initial_damp);
3789  UpdateCameraPoint(_cuVectorZK, _cuImageProj);
// The remaining calls mirror the gain-ratio bookkeeping of a real iteration;
// results are discarded (timing only).
3790  ComputeVectorDot(_cuVectorXK, _cuVectorJtE);
3791  ComputeJX(_cuVectorXK, _cuVectorJX);
3792  ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]);
3793 }
3794 
3795 template <class Float>
// Apply the solved step 'dx' to cameras and/or points (into the *EX buffers)
// and return the new squared reprojection error.  Index gives the signature
// as float UpdateCameraPoint(VectorF &dx, VectorF &cuImageTempProj); the
// signature line (original 3796) was lost in this extraction -- confirm.
// NOTE(review): the lines calling the parameter-update kernel (original
// 3803/3811/3817, presumably the free function UpdateCameraPoint(...)) were
// also lost; only its argument lines remain below.
3797  VectorF& cuImageTempProj) {
3798  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true);
3799 
3800  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
// Undo Jacobian normalization on the camera part of the step
// (first 8 * _num_camera entries) before applying it.
3801  if (__jacobian_normalize)
3802  ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera);
3804  _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3805  _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
// Motion-only: evaluate with updated cameras, original points.
3806  return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj);
3807  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
// Structure-only: rescale only the point part of the step.
3808  if (__jacobian_normalize)
3809  ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, _num_point * POINT_ALIGN,
3810  _num_camera * 8);
3812  _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3813  _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
3814  return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj);
3815  } else {
// Full bundle: rescale the entire step, update both cameras and points.
3816  if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx);
3818  _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX,
3819  _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]);
3820  return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj);
3821  }
3822 }
3823 
3824 template <class Float>
// Commit an accepted LM step (swap the *EX buffers into place) and return the
// LM gain ratio = actual residual reduction / expected (model-predicted)
// reduction.  The expected reduction uses the standard LM formula
// damping * (x^T D x) + x^T (J^T e), restricted to the camera or point part
// of the vectors depending on the current bundle mode.
3825 float SparseBundleCPU<Float>::SaveUpdatedSystem(float residual_reduction,
3826  float dx_sqnorm,
3827  float damping) {
3828  float expected_reduction;
3829  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
// Camera-only views over the first 8 * _num_camera entries.
3830  VectorF xk;
3831  xk.set(_cuVectorXK.data(), 8 * _num_camera);
3832  VectorF jte;
3833  jte.set(_cuVectorJtE.data(), 8 * _num_camera);
3834  float dxtg = (float)ComputeVectorDot(xk, jte);
3835  if (__lm_use_diagonal_damp) {
// Diagonal (Marquardt) damping: weight the step by the J^T J diagonal.
3836  VectorF jj;
3837  jj.set(_cuVectorJJ.data(), 8 * _num_camera);
3838  float dq = (float)ComputeVectorNormW(xk, jj);
3839  expected_reduction = damping * dq + dxtg;
3840  } else {
// Identity damping: plain squared step norm.
3841  expected_reduction = damping * dx_sqnorm + dxtg;
3842  }
3843  _cuCameraData.swap(_cuCameraDataEX);
3844  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
// Point-only views over the tail of the vectors.
3845  VectorF xk;
3846  xk.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point);
3847  VectorF jte;
3848  jte.set(_cuVectorJtE.data() + 8 * _num_camera, POINT_ALIGN * _num_point);
3849  float dxtg = (float)ComputeVectorDot(xk, jte);
3850  if (__lm_use_diagonal_damp) {
3851  VectorF jj;
3852  jj.set(_cuVectorJJ.data() + 8 * _num_camera, POINT_ALIGN * _num_point);
3853  float dq = (float)ComputeVectorNormW(xk, jj);
3854  expected_reduction = damping * dq + dxtg;
3855  } else {
3856  expected_reduction = damping * dx_sqnorm + dxtg;
3857  }
3858  _cuPointData.swap(_cuPointDataEX);
3859  } else {
// Full bundle: whole-vector dot product.
3860  float dxtg = (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE);
3861  if (__accurate_gain_ratio) {
// "Accurate" variant: expected reduction 2 * x^T g - ||J x||^2,
// computed with an extra J * x product.
3862  ComputeJX(_cuVectorXK, _cuVectorJX);
3863  float njx =
3864  (float)ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]);
3865  expected_reduction = 2.0f * dxtg - njx;
3866 
3867  // could the expected reduction be negative??? not sure
// Guard the division below: fall back to a tiny positive value so the
// gain ratio stays finite (and large, encouraging acceptance).
3868  if (expected_reduction <= 0)
3869  expected_reduction = 0.001f * residual_reduction;
3870  } else if (__lm_use_diagonal_damp) {
3871  float dq = (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ);
3872  expected_reduction = damping * dq + dxtg;
3873  } else {
3874  expected_reduction = damping * dx_sqnorm + dxtg;
3875  }
3877  _cuCameraData.swap(_cuCameraDataEX);
3878  _cuPointData.swap(_cuPointDataEX);
3879  }
3881  return float(residual_reduction / expected_reduction);
3882 }
3883 
3884 template <class Float>
// Release the Jacobian storage that the current bundle mode does not need:
// motion-only drops the point Jacobian, structure-only drops both camera
// Jacobian buffers.  NOTE(review): the signature line (original 3885) was
// lost in this extraction -- confirm the function name in the original file.
3886  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
3887  _cuJacobianPoint.resize(0);
3888  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
3889  _cuJacobianCamera.resize(0);
3890  _cuJacobianCameraT.resize(0);
3891  }
3892 }
3893 
3894 template <class Float>
// Return the squared norm of the solved step _cuVectorXK, restricted to the
// part relevant to the current bundle mode (cameras, points, or the whole
// vector).  Caller (the LM loop) takes sqrt of the result for the delta
// threshold test.  NOTE(review): the signature line (original 3895) was lost
// in this extraction; presumably float EvaluateDeltaNorm() -- confirm.
3896  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
// Camera part only: first 8 * _num_camera entries.
3897  VectorF temp;
3898  temp.set(_cuVectorXK.data(), 8 * _num_camera);
3899  return (float)ComputeVectorNorm(temp);
3900  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
// Point part only: tail of the vector.
3901  VectorF temp;
3902  temp.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point);
3903  return (float)ComputeVectorNorm(temp);
3904  } else {
3905  return (float)ComputeVectorNorm(_cuVectorXK);
3906  }
3907 }
3908 
3909 template <class Float>
// Main Levenberg-Marquardt optimization loop.  NOTE(review): the signature
// line (original 3910/3911) was lost in this extraction; presumably
// void NonlinearOptimizeLM() -- confirm against the original file.
// Each iteration: solve the damped normal equations, test the step, accept
// (commit buffers, shrink damping via the gain ratio) or reject (grow
// damping).  Sets __pba_return_code to a one-letter termination reason.
3912  TimerBA timer(this, TIMER_OPTIMIZATION);
3913 
// mse_convert_ratio turns the accumulated squared error into a mean squared
// error in the original (un-focal-scaled) units.
3915  float mse_convert_ratio =
3916  1.0f / (_num_imgpt * __focal_scaling * __focal_scaling);
3917  float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f;
3918  const int edwidth = __verbose_sse ? 12 : 8;
3919  _projection_sse =
3920  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
3921  __initial_mse = __final_mse = _projection_sse * mse_convert_ratio;
3922 
3923  // compute jacobian diagonals for normalization
3924  if (__jacobian_normalize) PrepareJacobianNormalization();
3925 
3926  // evalaute jacobian
3927  EvaluateJacobians();
3928  ComputeJtE(_cuImageProj, _cuVectorJtE);
3930  if (__verbose_level)
3931  std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean")
3932  << " squared error = " << __initial_mse * error_display_ratio
3933  << "\n----------------------------------------------\n";
3934 
// _cuVectorJX doubles as the trial-projection buffer between solves.
3936  VectorF& cuImageTempProj = _cuVectorJX;
3937  // VectorF& cuVectorTempJX = _cuVectorJX;
// When Jacobian normalization is on, the step to apply lives in _cuVectorZK
// (rescaled copy); otherwise _cuVectorXK is used directly.
3938  VectorF& cuVectorDX = _cuVectorSJ.size() ? _cuVectorZK : _cuVectorXK;
3939 
3941  float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf;
3942  SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm,
3943  g_inf);
3944 
3946  std::cout << std::left;
3947  for (int i = 0; i < __lm_max_iteration && !__abort_flag;
3948  __current_iteration = (++i)) {
3950  int num_cg_iteration = SolveNormalEquation(damping);
3951 
3952  // there must be NaN somewhere
3953  if (num_cg_iteration == 0) {
3954  if (__verbose_level)
3955  std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n";
3956  __pba_return_code = 'E';
3957  break;
3958  }
3959 
3960  // there must be infinity somewhere
3961  if (__recent_cg_status == 'I') {
3962  std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth)
3963  << "------- "
3964  << " u=" << std::setprecision(3) << std::setw(9) << damping
3965  << '\n' << std::setprecision(6);
// Retry the same iteration with increased damping (--i undoes the ++i).
3967  damping = damping * damping_adjust;
3968  damping_adjust = 2.0f * damping_adjust;
3969  --i;
3970  continue;
3971  }
3972 
3974  ++__num_lm_iteration;
3975 
3977  float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm);
3978 
3979  // In this library, we check absolute difference instead of realtive
3980  // difference
3981  if (dx_norm <= __lm_delta_threshold) {
3982  // damping factor must be way too big...or it converges
3983  if (__verbose_level > 1)
3984  std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
3985  << num_cg_iteration << char(__recent_cg_status)
3986  << " quit on too small change (" << dx_norm << " < "
3987  << __lm_delta_threshold << ")\n";
3988  __pba_return_code = 'S';
3989  break;
3990  }
3992  // update structure and motion, check reprojection error
3993  float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj);
3994  float average_residual = new_residual * mse_convert_ratio;
3995  float residual_reduction = _projection_sse - new_residual;
3996 
3997  // do we find a better solution?
3998  if (std::isfinite(new_residual) && residual_reduction > 0) {
// ---- step accepted ----
4000  float relative_reduction = 1.0f - (new_residual / _projection_sse);
4001 
4003  __num_lm_success++; // increase counter
4004  _projection_sse = new_residual; // save the new residual
4005  _cuImageProj.swap(cuImageTempProj); // save the new projection
4006 
// Commit the updated parameters and get the LM gain ratio.
4008  float gain_ratio =
4009  SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping);
4010 
4012  SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping,
4013  g_norm, g_inf);
4014 
4016  if (__verbose_level > 1)
4017  std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
4018  << num_cg_iteration << char(__recent_cg_status)
4019  << " e=" << std::setw(edwidth)
4020  << average_residual * error_display_ratio
4021  << " u=" << std::setprecision(3) << std::setw(9) << damping
4022  << " r=" << std::setw(6)
4023  << floor(gain_ratio * 1000.f) * 0.001f
4024  << " g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm << " "
4025  << std::setw(9) << relative_reduction << ' ' << std::setw(9)
4026  << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n"
4027  << std::setprecision(6);
4028 
// Check the remaining termination criteria, else adapt damping and
// re-evaluate Jacobians/gradient at the new point.
4030  if (!IsTimeBudgetAvailable()) {
4031  if (__verbose_level > 1)
4032  std::cout << "#" << std::setw(3) << i << " used up time budget.\n";
4033  __pba_return_code = 'T';
4034  break;
4035  } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) {
4036  if (__verbose_level > 1)
4037  std::cout << "#" << std::setw(3) << i
4038  << " converged with small gradient\n";
4039  __pba_return_code = 'G';
4040  break;
4041  } else if (average_residual * error_display_ratio <= __lm_mse_threshold) {
4042  if (__verbose_level > 1)
4043  std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n";
4044  __pba_return_code = 'M';
4045  break;
4046  } else {
// Standard LM damping update: shrink by
// max(1/3, 1 - (2 * gain - 1)^3), clamped to the configured range.
4048  float temp = gain_ratio * 2.0f - 1.0f;
4049  float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); //
4050  float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust);
4051 
4053  damping = damping * auto_adjust;
4054  damping_adjust = 2.0f;
4055  if (damping < __lm_minimum_damp)
4056  damping = __lm_minimum_damp;
4057  else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp &&
4058  __lm_use_diagonal_damp)
4059  damping = __lm_maximum_damp;
4060 
4061  EvaluateJacobians();
4062  ComputeJtE(_cuImageProj, _cuVectorJtE);
4063  }
4064  } else {
// ---- step rejected (residual grew or went non-finite) ----
4065  if (__verbose_level > 1)
4066  std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
4067  << num_cg_iteration << char(__recent_cg_status)
4068  << " e=" << std::setw(edwidth) << std::left
4069  << average_residual * error_display_ratio
4070  << " u=" << std::setprecision(3) << std::setw(9) << damping
4071  << " r=----- " << (__lm_check_gradient || __save_gradient_norm
4072  ? " g=---------"
4073  : " g=0")
4074  << " --------- " << std::setw(9) << dx_norm
4075  << " t=" << int(BundleTimerGetNow()) << "\n"
4076  << std::setprecision(6);
4077 
// Past the auto-switch threshold, fall back from diagonal damping to
// identity damping instead of growing the damping further.
4078  if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp &&
4079  damping > __lm_damping_auto_switch) {
4080  __lm_use_diagonal_damp = false;
4081  damping = __lm_damping_auto_switch;
4082  damping_adjust = 2.0f;
4083  if (__verbose_level > 1)
4084  std::cout << "NOTE: switch to damping with an identity matix\n";
4085  } else {
4087  damping = damping * damping_adjust;
4088  damping_adjust = 2.0f * damping_adjust;
4089  }
4090  }
4091 
4092  if (__verbose_level == 1) std::cout << '.';
4093  }
4094 
4095  __final_mse = float(_projection_sse * mse_convert_ratio);
// With radial distortion, also report the error of the undistorted model.
4096  __final_mse_x =
4097  __use_radial_distortion
4098  ? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) *
4099  mse_convert_ratio
4100  : __final_mse;
4101 }
4102 
// Profiling helper macros used by RunProfileSteps.  They assume a local
// 'repeat' (iteration count) and, for PROTILE/PROTILE2, a local 'nthread'
// array, in the expanding scope.
// PROFILE_REPORT2: print label A with an explicit time value T.
4103 #define PROFILE_REPORT2(A, T) \
4104  std::cout << std::setw(24) << A << ": " << (T) << "\n";
4105 
// PROFILE_REPORT: print label A with the mean time of the last PROFILE_ run.
4106 #define PROFILE_REPORT(A) \
4107  std::cout << std::setw(24) << A << ": " \
4108  << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n";
4109 
// PROFILE_: time 'repeat' executions of statement B.
4110 #define PROFILE_(B) \
4111  BundleTimerStart(TIMER_PROFILE_STEP); \
4112  for (int i = 0; i < repeat; ++i) { \
4113  B; \
4114  } \
4115  BundleTimerSwitch(TIMER_PROFILE_STEP);
4116 
// PROFILE: time call A B (function name + argument list) and report as #A.
// PROXILE: like PROFILE but with an explicit label string A.
4117 #define PROFILE(A, B) PROFILE_(A B) PROFILE_REPORT(#A)
4118 #define PROXILE(A, B) PROFILE_(B) PROFILE_REPORT(A)
// PROTILE: additionally search thread counts 1,2,4,...,THREAD_NUM_MAX for
// function slot FID, keep the fastest, and store it back into nthread[FID]
// (unless the original setting was 0 = automatic).  The early-break gives up
// once timings get worse past max(original, 16) threads.
4119 #define PROTILE(FID, A, B) \
4120  { \
4121  float tbest = FLT_MAX; \
4122  int nbest = 1; \
4123  int nto = nthread[FID]; \
4124  { \
4125  std::ostringstream os1; \
4126  os1 << #A "(" << nto << ")"; \
4127  PROXILE(os1.str(), A B); \
4128  } \
4129  for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \
4130  nthread[FID] = j; \
4131  PROFILE_(A B); \
4132  float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4133  if (t > tbest) { \
4134  if (j >= max(nto, 16)) break; \
4135  } else { \
4136  tbest = t; \
4137  nbest = j; \
4138  } \
4139  } \
4140  if (nto != 0) nthread[FID] = nbest; \
4141  { \
4142  std::ostringstream os; \
4143  os << #A "(" << nbest << ")"; \
4144  PROFILE_REPORT2(os.str(), tbest); \
4145  } \
4146  }
4147 
// PROTILE2: same search over two thread-count slots (FID1 then FID2),
// optimized one at a time; restores 0 (automatic) if that was the original.
4148 #define PROTILE2(FID1, FID2, A, B) \
4149  { \
4150  int nt1 = nthread[FID1], nt2 = nthread[FID2]; \
4151  { \
4152  std::ostringstream os1; \
4153  os1 << #A "(" << nt1 << "," << nt2 << ")"; \
4154  PROXILE(os1.str(), A B); \
4155  } \
4156  float tbest = FLT_MAX; \
4157  int nbest1 = 1, nbest2 = 1; \
4158  nthread[FID2] = 1; \
4159  for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \
4160  nthread[FID1] = j; \
4161  PROFILE_(A B); \
4162  float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4163  if (t > tbest) { \
4164  if (j >= max(nt1, 16)) break; \
4165  } else { \
4166  tbest = t; \
4167  nbest1 = j; \
4168  } \
4169  } \
4170  nthread[FID1] = nbest1; \
4171  for (int j = 2; j <= THREAD_NUM_MAX; j *= 2) { \
4172  nthread[FID2] = j; \
4173  PROFILE_(A B); \
4174  float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \
4175  if (t > tbest) { \
4176  if (j >= max(nt2, 16)) break; \
4177  } else { \
4178  tbest = t; \
4179  nbest2 = j; \
4180  } \
4181  } \
4182  nthread[FID2] = nbest2; \
4183  { \
4184  std::ostringstream os; \
4185  os << #A "(" << nbest1 << "," << nbest2 << ")"; \
4186  PROFILE_REPORT2(os.str(), tbest); \
4187  } \
4188  if (nt1 == 0) nthread[FID1] = 0; \
4189  if (nt2 == 0) nthread[FID2] = 0; \
4190  }
4191 
4192 template <class Float>
// Benchmark every major kernel and auto-tune the per-function thread counts
// in __num_cpu_thread via the PROTILE/PROTILE2 macros.  NOTE(review): the
// signature line (original 4193) was lost in this extraction; presumably
// void RunProfileSteps() -- confirm.
4194  const int repeat = std::max(__profile_pba, 1);
4195  int* nthread = __num_cpu_thread;
4196  std::cout << "---------------------------------\n"
4197  "| Run profiling steps ("
4198  << repeat << ") |\n"
4199  "---------------------------------\n"
4200  << std::left;
4201  ;
4202 
// Prime all buffers so the timed runs measure steady-state work.
4204  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
4205  if (__jacobian_normalize) PrepareJacobianNormalization();
4206  EvaluateJacobians();
4207  ComputeJtE(_cuImageProj, _cuVectorJtE);
4208  ComputeBlockPC(__lm_initial_damp, true);
// Raise the damping until both PCG solvers run their full 10 iterations,
// so the solver benchmarks below do comparable amounts of work.
4210  do {
4211  if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 &&
4212  SolveNormalEquationPCGB(__lm_initial_damp) == 10)
4213  break;
4214  __lm_initial_damp *= 2.0f;
4215  } while (__lm_initial_damp < 1024.0f);
4216  std::cout << "damping set to " << __lm_initial_damp << " for profiling\n"
4217  << "---------------------------------\n";
// Solver benchmarks: pin CG to exactly 10 iterations (local 'repeat' = 10
// shadows the outer one), restore the CG limits afterwards.
4219  {
4220  int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration;
4221  __cg_max_iteration = __cg_min_iteration = 10;
4222  __num_cg_iteration = 0;
4223  PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp));
4224  if (__num_cg_iteration != 100)
4225  std::cout << __num_cg_iteration << " cg iterations in all\n";
4227  __num_cg_iteration = 0;
4228  PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp));
4229  if (__num_cg_iteration != 100)
4230  std::cout << __num_cg_iteration << " cg iterations in all\n";
4231  std::cout << "---------------------------------\n";
4233  __num_cg_iteration = 0;
4234  PROXILE("Single iteration LMX", RunTestIterationLM(true));
4235  if (__num_cg_iteration != 100)
4236  std::cout << __num_cg_iteration << " cg iterations in all\n";
4238  __num_cg_iteration = 0;
4239  PROXILE("Single iteration LMB", RunTestIterationLM(false));
4240  if (__num_cg_iteration != 100)
4241  std::cout << __num_cg_iteration << " cg iterations in all\n";
4242  std::cout << "---------------------------------\n";
4243  __cg_max_iteration = cgmax;
4244  __cg_min_iteration = cgmin;
4245  }
4246 
// Basic vector-kernel benchmarks.  NOTE(review): the line before original
// 4254 (presumably a PROFILE(ComputeSXYPZ, ...) opener) was lost in this
// extraction -- confirm against the original file.
4248  PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj));
4249  PROFILE(ComputeVectorNorm, (_cuVectorXK));
4250  PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK));
4251  PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK));
4252  PROFILE(ComputeSAXPY, ((Float)0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK));
4254  ((Float)0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK));
4255  std::cout << "---------------------------------\n";
4256  PROTILE(FUNC_VS, ComputeVectorNorm,
4257  (_cuImageProj, nthread[FUNC_VS])); // reset the parameter to 0
4258 
// Tune SAXPY thread count on scratch buffers of projection size.
4260  {
4261  avec<Float> temp1(_cuImageProj.size()), temp2(_cuImageProj.size());
4262  SetVectorZero(temp1);
4263  PROTILE(FUNC_VV, ComputeSAXPY,
4264  ((Float)0.01f, _cuImageProj, temp1, temp2, nthread[FUNC_VV]));
4265  }
4266 
4267  std::cout << "---------------------------------\n";
4268  __multiply_jx_usenoj = false;
4269 
4271  PROTILE(FUNC_PJ, EvaluateProjection,
4272  (_cuCameraData, _cuPointData, _cuImageProj));
4273  PROTILE2(FUNC_MPC, FUNC_MPP, ApplyBlockPC, (_cuVectorJtE, _cuVectorPK));
4274 
// Jacobian-dependent kernels under each storage configuration
// (original JC, transposed JC, neither).
4276  if (!__no_jacobian_store) {
4277  if (__jc_store_original) {
4278  PROTILE(FUNC_JX, ComputeJX, (_cuVectorJtE, _cuVectorJX));
4279 
4280  if (__jc_store_transpose) {
4281  PROTILE(FUNC_JJ_JCO_JCT_JP, EvaluateJacobians, ());
4282  PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, ComputeJtE,
4283  (_cuImageProj, _cuVectorJtE));
4284  PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true));
4285  PROFILE(ComputeDiagonal, (_cuVectorPK));
4286 
4287  std::cout << "---------------------------------\n"
4288  "| Not storing original JC | \n"
4289  "---------------------------------\n";
4290  __jc_store_original = false;
4291  PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ());
4292  __jc_store_original = true;
4293  }
4294 
4296  std::cout << "---------------------------------\n"
4297  "| Not storing transpose JC | \n"
4298  "---------------------------------\n";
4299  __jc_store_transpose = false;
4300  _cuJacobianCameraT.resize(0);
4301  PROTILE(FUNC_JJ_JCO_JP, EvaluateJacobians, ());
4302  PROTILE2(FUNC_JTEC_JCO, FUNC_JTEP, ComputeJtE,
4303  (_cuImageProj, _cuVectorJtE));
4304  PROTILE2(FUNC_BCC_JCO, FUNC_BCP, ComputeBlockPC, (0.001f, true));
4305  PROFILE(ComputeDiagonal, (_cuVectorPK));
4306  } else if (__jc_store_transpose) {
4307  PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, ComputeJtE,
4308  (_cuImageProj, _cuVectorJtE));
4309  PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true));
4310  PROFILE(ComputeDiagonal, (_cuVectorPK));
4311 
4312  std::cout << "---------------------------------\n"
4313  "| Not storing original JC | \n"
4314  "---------------------------------\n";
4315  PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ());
4316  }
4317  }
4318 
4319  if (!__no_jacobian_store) {
4320  std::cout << "---------------------------------\n"
4321  "| Not storing Camera Jacobians | \n"
4322  "---------------------------------\n";
4323  __jc_store_transpose = false;
4324  __jc_store_original = false;
4325  _cuJacobianCamera.resize(0);
4326  _cuJacobianCameraT.resize(0);
4327  PROTILE(FUNC_JJ_JP, EvaluateJacobians, ());
4328  PROTILE(FUNC_JTE_, ComputeJtE, (_cuImageProj, _cuVectorJtE));
4329  // PROFILE(ComputeBlockPC, (0.001f, true));
4330  }
4331 
// Matrix-free variants: everything recomputed on the fly.  Note this leaves
// __no_jacobian_store / __jc_store_* permanently altered on exit.
4333  std::cout << "---------------------------------\n"
4334  "| Not storing any jacobians |\n"
4335  "---------------------------------\n";
4336  __no_jacobian_store = true;
4337  _cuJacobianPoint.resize(0);
4338  PROTILE(FUNC_JX_, ComputeJX, (_cuVectorJtE, _cuVectorJX));
4339  PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
4340  PROFILE(ComputeBlockPC, (0.001f, true));
4341  std::cout << "---------------------------------\n";
4342 }
4343 
4344 template <class Float>
// Return the number of logical processors: Win32 SYSTEM_INFO on Windows
// (GetNativeSystemInfo for WinRT/UWP builds, GetSystemInfo otherwise),
// POSIX sysconf(_SC_NPROCESSORS_ONLN) elsewhere.  NOTE(review): the
// signature line (original 4345) was lost in this extraction -- confirm.
4346 #ifdef _WIN32
4347 #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP
4348  SYSTEM_INFO sysinfo;
4349  GetNativeSystemInfo(&sysinfo);
4350 #else
4351  SYSTEM_INFO sysinfo;
4352  GetSystemInfo(&sysinfo);
4353 #endif
4354  return sysinfo.dwNumberOfProcessors;
4355 #else
4356  return sysconf(_SC_NPROCESSORS_ONLN);
4357 #endif
4358 }
4359 
4360 ParallelBA* NewSparseBundleCPU(bool dp, const int num_threads) {
4361 #ifndef SIMD_NO_DOUBLE
4362  if (dp)
4363  return new SparseBundleCPU<double>(num_threads);
4364  else
4365 #endif
4366  return new SparseBundleCPU<float>(num_threads);
4367 }
4368 
4369 } // namespace pba
int width
std::string name
void * X
Definition: SmallVector.cpp:45
#define PROXILE(A, B)
#define PROTILE(FID, A, B)
#define MYTHREAD
#define DEFINE_THREAD_DATA(X)
#define ALLOCATE_REQUIRED_DATA(NAME, num, channels)
#define PROTILE2(FID1, FID2, A, B)
#define END_THREAD_RPOC(X)
#define BEGIN_THREAD_PROC(X)
#define POINT_ALIGN
#define WAIT_THREAD(tv, n)
#define POINT_ALIGN2
#define PROFILE(A, B)
#define THREAD_NUM_MAX
#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option)
#define AUTO_MT_NUM(sz)
#define RUN_THREAD(X, t,...)
#define ALIGN_PTR(p)
#define FLOAT_ALIGN
#define NULL
core::Tensor result
Definition: VtkUtils.cpp:76
bool copy
Definition: VtkUtils.cpp:74
int __verbose_level
Definition: ConfigBA.h:121
int __cpu_data_precision
Definition: ConfigBA.h:153
void SaveBundleRecord(int iter, float res, float damping, float gn, float gi)
Definition: ConfigBA.cpp:329
@ FUNC_JJ_JCO_JCT_JP
Definition: ConfigBA.h:54
@ FUNC_JJ_JCO_JP
Definition: ConfigBA.h:55
@ FUNC_JJ_JCT_JP
Definition: ConfigBA.h:56
int __num_cpu_thread[NUM_FUNC]
Definition: ConfigBA.h:186
bool __multiply_jx_usenoj
Definition: ConfigBA.h:149
void SaveBundleRecord(int iter, float res, float damping, float &g_norm, float &g_inf)
virtual void SetProjection(size_t nproj, const Point2D *imgpts, const int *point_idx, const int *cam_idx)
void ComputeJtE(VectorF &E, VectorF &JtE, int mode=0)
bool ProcessIndexCameraQ(std::vector< int > &qmap, std::vector< int > &qlist)
void RunTestIterationLM(bool reduced)
int SolveNormalEquationPCGX(float lambda)
float EvaluateProjectionX(VectorF &cam, VectorF &point, VectorF &proj)
virtual int RunBundleAdjustment()
int SolveNormalEquationPCGB(float lambda)
float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, float damping)
virtual void SetCameraData(size_t ncam, CameraT *cams)
void ProcessWeightCameraQ(std::vector< int > &cpnum, std::vector< int > &qmap, Float *qmapw, Float *qlistw)
void ComputeBlockPC(float lambda, bool dampd)
virtual float GetMeanSquaredError()
virtual void SetPointData(size_t npoint, Point3D *pts)
void ComputeDiagonal(VectorF &JJI)
virtual void SetFocalMask(const int *fmask, float weight)
void ComputeJX(VectorF &X, VectorF &JX, int mode=0)
int SolveNormalEquation(float lambda)
void ApplyBlockPC(VectorF &v, VectorF &pv, int mode=0)
float UpdateCameraPoint(VectorF &dx, VectorF &cuImageTempProj)
float EvaluateProjection(VectorF &cam, VectorF &point, VectorF &proj)
void SaveToFile(const char *name)
Float * begin()
size_t size() const
Float * end()
Float * data()
void swap(avec< Float > &next)
void set(Float *data, size_t count)
__host__ __device__ float2 fabs(float2 v)
Definition: cutil_math.h:1254
int min(int a, int b)
Definition: cutil_math.h:53
int max(int a, int b)
Definition: cutil_math.h:48
static void error(char *msg)
Definition: lsd.c:159
MiniVec< float, N > floor(const MiniVec< float, N > &a)
Definition: MiniVec.h:75
double ComputeVectorDot(const avec< Float > &vec1, const avec< Float > &vec2)
void ComputeJX_(size_t nproj, size_t ncam, const Float *x, Float *jx, const Float *camera, const Float *point, const Float *ms, const Float *sj, const int *jmap, bool intrinsic_fixed, int radial_distortion, int mode, int mt=16)
void ComputeProjection(size_t nproj, const Float *camera, const Float *point, const Float *ms, const int *jmap, Float *pj, int radial, int mt)
void ComputeVectorNorm(const Float *p, const Float *pe, double *psum)
void ComputeProjectionX(size_t nproj, const Float *camera, const Float *point, const Float *ms, const int *jmap, Float *pj, int radial, int mt)
void AddScaledVec8(Float a, const Float *x, Float *v)
void ComputeVXY(const avec< Float > &vec1, const avec< Float > &vec2, avec< Float > &result, size_t part=0, size_t skip=0)
void ComputeSAXPY(Float a, const avec< Float > &vec1, const avec< Float > &vec2, avec< Float > &result, int mt=0)
void MemoryCopyA(const Float *p, const Float *pe, Float *d)
void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, const Float *jc, const int *cmap, const int *cmlist, Float *di, Float *bi, int vn, bool jc_transpose, bool use_jq, int mt)
void ComputeSAX(Float a, const avec< Float > &vec1, avec< Float > &result)
double ComputeVectorNorm(const avec< Float > &vec, int mt=0)
void MultiplyBlockConditionerP(int npoint, const Float *bi, const Float *x, Float *vx, int mt=0)
void ComputeDiagonalBlock(size_t ncam, size_t npts, float lambda, bool dampd, const Float *jc, const int *cmap, const Float *jp, const int *pmap, const int *cmlist, const Float *sj, const Float *wq, Float *diag, Float *blocks, int radial_distortion, bool jc_transpose, int mt1=2, int mt2=2, int mode=0)
void MemoryCopyB(const Float *p, const Float *pe, Float *d)
void MultiplyBlockConditioner(int ncam, int npoint, const Float *blocksv, const Float *vec, Float *resultv, int radial, int mode, int mt1, int mt2)
void ComputeDiagonalAddQ(size_t ncam, const Float *qw, Float *d, const Float *sj=NULL)
Float DotProduct8(const Float *v1, const Float *v2)
void SetVectorZero(Float *p, Float *pe)
Float ComputeVectorMax(const avec< Float > &vec)
void ComputeJtEC(size_t ncam, const Float *pe, const Float *jc, const int *cmap, const int *cmlist, Float *v, bool jc_transpose, int mt)
void ComputeJQtEC(size_t ncam, const Float *pe, const int *qlist, const Float *wq, const Float *sj, Float *v)
void ComputeJX(size_t nproj, size_t ncam, const Float *x, const Float *jc, const Float *jp, const int *jmap, Float *jx, int mode, int mt=2)
double ComputeVectorNormW(const avec< Float > &vec, const avec< Float > &weight)
void MultiplyBlockConditionerC(int ncam, const Float *bi, const Float *x, Float *vx, int vn, int mt=0)
void ComputeSQRT(avec< Float > &vec)
void JacobianOne(const Float *c, const Float *pt, const Float *ms, Float *jxc, Float *jyc, Float *jxp, Float *jyp, bool intrinsic_fixed, int radial_distortion)
void ComputeJacobian(size_t nproj, size_t ncam, const Float *camera, const Float *point, Float *jc, Float *jp, const int *jmap, const Float *sj, const Float *ms, const int *cmlist, bool intrinsic_fixed, int radial_distortion, bool shuffle, Float *jct, int mt=2, int i0=0)
void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float *ee, Float *jte, const Float *camera, const Float *point, const Float *ms, const int *jmap, const int *cmap, const int *cmlist, const int *pmap, const Float *jp, bool intrinsic_fixed, int radial_distortion, int mode, int mt)
void ComputeTwoJX(const Float *jc, const Float *jp, const Float *xc, const Float *xp, Float *jx)
void ComputeJtEP(size_t npt, const Float *pe, const Float *jp, const int *pmap, Float *v, int mt)
void AddBlockJtJ(const Float *jc, Float *block, int vn)
void ComputeSXYPZ(Float a, const avec< Float > &vec1, const avec< Float > &vec2, const avec< Float > &vec3, avec< Float > &result)
void ComputeDiagonal(const avec< Float > &jcv, const vector< int > &cmapv, const avec< Float > &jpv, const vector< int > &pmapv, const vector< int > &cmlistv, const Float *qw0, avec< Float > &jtjdi, bool jc_transpose, int radial)
void ComputeRSQRT(avec< Float > &vec)
void ComputeJQX(size_t nq, const Float *x, const int *qmap, const Float *wq, const Float *sj, Float *jx)
void ComputeProjectionQ(size_t nq, const Float *camera, const int *qmap, const Float *wq, Float *pj)
void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, const Float *jp, const int *pmap, Float *di, Float *bi, int mt)
void InvertSymmetricMatrix(T a[n][m], T ai[n][m])
void UpdateCamera(int ncam, const avec< Float > &camera, const avec< Float > &delta, avec< Float > &new_camera)
void UpdateCameraPoint(int ncam, const avec< Float > &camera, const avec< Float > &point, avec< Float > &delta, avec< Float > &new_camera, avec< Float > &new_point, int mode, int mt)
void ComputeJtEC_(size_t ncam, const Float *ee, Float *jte, const Float *c, const Float *point, const Float *ms, const int *jmap, const int *cmap, const int *cmlist, bool intrinsic_fixed, int radial_distortion, int mt)
void UncompressRodriguesRotation(const Float r[3], Float m[])
void GetRodriguesRotation(const Float m[3][3], Float r[3])
void ComputeJtE(size_t ncam, size_t npt, const Float *pe, const Float *jc, const int *cmap, const int *cmlist, const Float *jp, const int *pmap, Float *v, bool jc_transpose, int mode, int mt1, int mt2)
void ComputeDiagonalBlock_(float lambda, bool dampd, const avec< Float > &camerav, const avec< Float > &pointv, const avec< Float > &meas, const vector< int > &jmapv, const avec< Float > &sjv, avec< Float > &qwv, avec< Float > &diag, avec< Float > &blocks, bool intrinsic_fixed, int radial_distortion, int mode=0)
void ScaleJ8(Float *jcx, Float *jcy, const Float *sj)
void SetVectorZero(avec< Float > &vec)
void ComputeSAXPY(Float a, const Float *it1, const Float *it2, Float *it3, Float *ite)
void ComputeSXYPZ(Float a, const Float *p1, const Float *p2, const Float *p3, Float *p4, Float *pe)
Definition: ConfigBA.cpp:44
float float_t
ParallelBA * NewSparseBundleCPU(bool dp, const int num_threads)
double timer
Definition: struct.h:215
float_t m[3][3]
Definition: DataInterface.h:53
float_t t[3]
Definition: DataInterface.h:52
Definition: lsd.c:149