ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ProgramCU.cu
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////////
2 // File: ProgramCU.cu
3 // Author: Changchang Wu
4 // Description : implementation of ProgramCU and all CUDA kernels
5 //
6 // Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)
7 // and the University of Washington at Seattle
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU General Public
11 // License as published by the Free Software Foundation; either
12 // Version 3 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // General Public License for more details.
18 //
19 ////////////////////////////////////////////////////////////////////////////////
20 
21 #include <stdio.h>
22 #include <float.h>
23 #include <cuda_runtime.h>
24 #include <unordered_map>
25 #include "CuTexImage.h"
26 #include "ProgramCU.h"
27 
28 #define IMUL(X, Y) ((X) * (Y))
29 #define FDIV(X, Y) __fdividef(X, Y)
30 #define FDIV2(X, Y) ((X) / (Y))
31 #define MAX_BLOCKLEN 65535
32 #define MAX_BLOCKLEN_ALIGN 65504
33 #define MAX_TEXSIZE (1 << 29)
34 #define TEX_TOOBIG4(sz) (sz >> 31)
35 #define REDUCTION_NBLOCK 32
36 
37 namespace pba {
38 // Helpers to create texture descriptors and channel descriptors (host-side)
39 static inline cudaTextureDesc PBA_MakeTexDesc(cudaTextureReadMode read_mode) {
40  cudaTextureDesc desc;
41  memset(&desc, 0, sizeof(desc));
42  desc.readMode = read_mode;
43  desc.addressMode[0] = cudaAddressModeClamp;
44  desc.addressMode[1] = cudaAddressModeClamp;
45  desc.addressMode[2] = cudaAddressModeClamp;
46  desc.filterMode = cudaFilterModePoint;
47  desc.normalizedCoords = false;
48  return desc;
49 }
50 
51 static inline cudaChannelFormatDesc PBA_ChanFloat() { return cudaCreateChannelDesc<float>(); }
52 static inline cudaChannelFormatDesc PBA_ChanFloat2() { return cudaCreateChannelDesc<float2>(); }
53 static inline cudaChannelFormatDesc PBA_ChanFloat4() { return cudaCreateChannelDesc<float4>(); }
54 static inline cudaChannelFormatDesc PBA_ChanInt() { return cudaCreateChannelDesc<int>(); }
55 static inline cudaChannelFormatDesc PBA_ChanInt2() { return cudaCreateChannelDesc<int2>(); }
56 
57 // Texture object cache for large-scale binding to avoid excessive creation
58 struct PBA_TexKey {
59  int device_id;
60  const void* dev_ptr;
61  size_t size_bytes;
62  int read_mode;
63  int x, y, z, w, f;
64  bool operator==(const PBA_TexKey& other) const {
65  return device_id == other.device_id &&
66  dev_ptr == other.dev_ptr && size_bytes == other.size_bytes && read_mode == other.read_mode &&
67  x == other.x && y == other.y && z == other.z && w == other.w && f == other.f;
68  }
69 };
70 
71 struct PBA_TexKeyHasher {
72  size_t operator()(const PBA_TexKey& k) const {
73  auto h = std::hash<int>()(k.device_id);
74  size_t addr = reinterpret_cast<size_t>(k.dev_ptr);
75  h ^= std::hash<size_t>()(addr + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2));
76  h ^= std::hash<size_t>()(k.size_bytes + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2));
77  h ^= std::hash<int>()(k.read_mode + 0x9e3779b9 + (h<<6) + (h>>2));
78  h ^= std::hash<int>()(k.x + (h<<6) + (h>>2));
79  h ^= std::hash<int>()(k.y + (h<<6) + (h>>2));
80  h ^= std::hash<int>()(k.z + (h<<6) + (h>>2));
81  h ^= std::hash<int>()(k.w + (h<<6) + (h>>2));
82  h ^= std::hash<int>()(k.f + (h<<6) + (h>>2));
83  return h;
84  }
85 };
86 
87 static std::unordered_map<PBA_TexKey, cudaTextureObject_t, PBA_TexKeyHasher>& PBA_GetTexCache() {
88  static std::unordered_map<PBA_TexKey, cudaTextureObject_t, PBA_TexKeyHasher> cache;
89  return cache;
90 }
91 
92 static cudaTextureObject_t PBA_AcquireTextureObject1D(CuTexImage& img,
93  const cudaTextureDesc& tex_desc,
94  const cudaChannelFormatDesc& ch_desc) {
95  const void* dev_ptr = img.data();
96  const size_t size_bytes = img.GetDataSize();
97  if (dev_ptr == nullptr || size_bytes == 0) return 0;
98  int device_id = 0;
99  cudaGetDevice(&device_id);
100  PBA_TexKey key{device_id, dev_ptr, size_bytes, (int)tex_desc.readMode, ch_desc.x, ch_desc.y, ch_desc.z, ch_desc.w, ch_desc.f};
101  auto& cache = PBA_GetTexCache();
102  auto it = cache.find(key);
103  if (it != cache.end()) return it->second;
104  cudaResourceDesc res_desc{};
105  memset(&res_desc, 0, sizeof(res_desc));
106  res_desc.resType = cudaResourceTypeLinear;
107  res_desc.res.linear.devPtr = const_cast<void*>(dev_ptr);
108  res_desc.res.linear.desc = ch_desc;
109  res_desc.res.linear.sizeInBytes = size_bytes;
110  cudaTextureObject_t handle = 0;
111  cudaCreateTextureObject(&handle, &res_desc, &tex_desc, nullptr);
112  cache.emplace(key, handle);
113  return handle;
114 }
115 
116 static cudaTextureObject_t PBA_AcquireTextureObject1DRange(const void* base_dev_ptr,
117  size_t byte_offset,
118  size_t size_bytes,
119  const cudaTextureDesc& tex_desc,
120  const cudaChannelFormatDesc& ch_desc) {
121  if (base_dev_ptr == nullptr || size_bytes == 0) return 0;
122  int device_id = 0;
123  cudaGetDevice(&device_id);
124  PBA_TexKey key{device_id, (const char*)base_dev_ptr + byte_offset, size_bytes,
125  (int)tex_desc.readMode, ch_desc.x, ch_desc.y, ch_desc.z, ch_desc.w, ch_desc.f};
126  auto& cache = PBA_GetTexCache();
127  auto it = cache.find(key);
128  if (it != cache.end()) return it->second;
129  cudaResourceDesc res_desc{};
130  memset(&res_desc, 0, sizeof(res_desc));
131  res_desc.resType = cudaResourceTypeLinear;
132  res_desc.res.linear.devPtr = (void*)((const char*)base_dev_ptr + byte_offset);
133  res_desc.res.linear.desc = ch_desc;
134  res_desc.res.linear.sizeInBytes = size_bytes;
135  cudaTextureObject_t handle = 0;
136  cudaCreateTextureObject(&handle, &res_desc, &tex_desc, nullptr);
137  cache.emplace(key, handle);
138  return handle;
139 }
140 
141 static void PBA_ClearTextureObjectCache() {
142  auto& cache = PBA_GetTexCache();
143  int prev_device = 0;
144  cudaGetDevice(&prev_device);
145  for (auto& kv : cache) {
146  // set device to the device that owns this texture object
147  int dev = kv.first.device_id;
148  if (dev != prev_device) cudaSetDevice(dev);
149  if (kv.second) cudaDestroyTextureObject(kv.second);
150  }
151  if (!cache.empty() && prev_device >= 0) cudaSetDevice(prev_device);
152  cache.clear();
153 }
154 
155 // Device-side texture object handles
156 __device__ cudaTextureObject_t tex_jacobian_cam;
157 __device__ cudaTextureObject_t tex_jacobian_pts;
158 __device__ cudaTextureObject_t tex_jacobian_idx;
159 __device__ cudaTextureObject_t tex_jacobian_meas;
160 __device__ cudaTextureObject_t tex_jacobian_sj;
161 __device__ cudaTextureObject_t tex_jacobian_shuffle;
162 
163 __device__ cudaTextureObject_t tex_compact_cam;
164 __device__ cudaTextureObject_t tex_uncompressed_cam;
165 
166 __device__ cudaTextureObject_t tex_update_cam;
167 __device__ cudaTextureObject_t tex_update_cam_delta;
168 
169 __device__ cudaTextureObject_t tex_projection_cam;
170 __device__ cudaTextureObject_t tex_projection_idx;
171 __device__ cudaTextureObject_t tex_projection_pts;
172 __device__ cudaTextureObject_t tex_projection_mea;
173 
174 __device__ cudaTextureObject_t tex_jte_pe;
175 __device__ cudaTextureObject_t tex_jte_pex;
176 __device__ cudaTextureObject_t tex_jte_jc;
177 __device__ cudaTextureObject_t tex_jte_jc2;
178 __device__ cudaTextureObject_t tex_jte_cmp;
179 __device__ cudaTextureObject_t tex_jte_cmt;
180 __device__ cudaTextureObject_t tex_jte_jc3;
181 __device__ cudaTextureObject_t tex_jte_jc4;
182 __device__ cudaTextureObject_t tex_jte_jp;
183 __device__ cudaTextureObject_t tex_jte_pmp;
184 __device__ cudaTextureObject_t tex_jte_jp2;
185 
186 __device__ cudaTextureObject_t tex_jtjd_cmp;
187 __device__ cudaTextureObject_t tex_jtjd_cmlist;
188 __device__ cudaTextureObject_t tex_jtjd_jp;
189 __device__ cudaTextureObject_t tex_jtjd_pmp;
190 __device__ cudaTextureObject_t tex_jtjd_jp2;
191 
192 __device__ cudaTextureObject_t tex_shuffle_jc;
193 __device__ cudaTextureObject_t tex_shuffle_map;
194 __device__ cudaTextureObject_t tex_shuffle_jc2;
195 
196 __device__ cudaTextureObject_t tex_mjx_jc;
197 __device__ cudaTextureObject_t tex_mjx_jc2;
198 __device__ cudaTextureObject_t tex_mjx_jc3;
199 __device__ cudaTextureObject_t tex_mjx_jc4;
200 __device__ cudaTextureObject_t tex_mjx_jp;
201 __device__ cudaTextureObject_t tex_mjx_jp2;
202 __device__ cudaTextureObject_t tex_mjx_idx;
203 __device__ cudaTextureObject_t tex_mjx_x;
204 
205 __device__ cudaTextureObject_t tex_jte_q_idx;
206 __device__ cudaTextureObject_t tex_jte_q_w;
207 
208 // Macro to bind a CuTexImage to a device-side texture object symbol using cache
209 // Avoid frequent cudaMemcpyToSymbol if handle unchanged
210 #define PBA_SET_TEX_SYMBOL(sym, handle) \
211  do { static cudaTextureObject_t __last_##sym = 0; \
212  if (__last_##sym != (handle)) { cudaMemcpyToSymbol(sym, &(handle), sizeof(handle)); \
213  __last_##sym = (handle); } } while (0)
214 
215 #define PBA_BIND_TEX1D(sym, img, read_mode, chdesc) \
216  do { \
217  auto __tex_desc = PBA_MakeTexDesc(read_mode); \
218  auto __chan_desc = chdesc; \
219  cudaTextureObject_t __h = PBA_AcquireTextureObject1D(img, __tex_desc, __chan_desc); \
220  PBA_SET_TEX_SYMBOL(sym, __h); \
221  } while (0)
222 
223 // Bind CuTexImage as two/ four linear segments
224 #define PBA_BIND_TEX1D_2(sym1, sym2, img, read_mode, chdesc) \
225  do { \
226  auto __tex_desc = PBA_MakeTexDesc(read_mode); \
227  auto __chan_desc = chdesc; \
228  size_t __size = img.GetDataSize(); \
229  const void* __base = img.data(); \
230  size_t __elem_bits = (size_t)abs(__chan_desc.x) + abs(__chan_desc.y) + abs(__chan_desc.z) + abs(__chan_desc.w); \
231  size_t __elem_size = (__elem_bits >> 3); \
232  size_t __chunk = MAX_TEXSIZE - (MAX_TEXSIZE % (__elem_size ? __elem_size : 1)); \
233  size_t __sz0 = __size > __chunk ? __chunk : __size; \
234  cudaTextureObject_t __h1 = PBA_AcquireTextureObject1DRange(__base, 0, __sz0, __tex_desc, __chan_desc); \
235  PBA_SET_TEX_SYMBOL(sym1, __h1); \
236  if (__size > __chunk) { \
237  size_t __sz1 = __size - __chunk; \
238  cudaTextureObject_t __h2 = PBA_AcquireTextureObject1DRange(__base, __chunk, __sz1, __tex_desc, __chan_desc); \
239  PBA_SET_TEX_SYMBOL(sym2, __h2); \
240  } \
241  } while (0)
242 
243 #define PBA_BIND_TEX1D_4(sym1, sym2, sym3, sym4, img, read_mode, chdesc) \
244  do { \
245  auto __tex_desc = PBA_MakeTexDesc(read_mode); \
246  auto __chan_desc = chdesc; \
247  size_t __size = img.GetDataSize(); \
248  const void* __base = img.data(); \
249  size_t __elem_bits = (size_t)abs(__chan_desc.x) + abs(__chan_desc.y) + abs(__chan_desc.z) + abs(__chan_desc.w); \
250  size_t __elem_size = (__elem_bits >> 3); \
251  size_t __chunk = MAX_TEXSIZE - (MAX_TEXSIZE % (__elem_size ? __elem_size : 1)); \
252  size_t __off = 0; \
253  size_t __rem = __size; \
254  cudaTextureObject_t __h1 = PBA_AcquireTextureObject1DRange(__base, __off, \
255  (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
256  PBA_SET_TEX_SYMBOL(sym1, __h1); \
257  __off += (__rem > __chunk ? __chunk : __rem); \
258  __rem = (__rem > __chunk ? __rem - __chunk : 0); \
259  if (__rem > 0) { \
260  cudaTextureObject_t __h2 = PBA_AcquireTextureObject1DRange(__base, __off, \
261  (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
262  PBA_SET_TEX_SYMBOL(sym2, __h2); \
263  __off += (__rem > __chunk ? __chunk : __rem); \
264  __rem = (__rem > __chunk ? __rem - __chunk : 0); \
265  } \
266  if (__rem > 0) { \
267  cudaTextureObject_t __h3 = PBA_AcquireTextureObject1DRange(__base, __off, \
268  (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
269  PBA_SET_TEX_SYMBOL(sym3, __h3); \
270  __off += (__rem > __chunk ? __chunk : __rem); \
271  __rem = (__rem > __chunk ? __rem - __chunk : 0); \
272  } \
273  if (__rem > 0) { \
274  cudaTextureObject_t __h4 = PBA_AcquireTextureObject1DRange(__base, __off, __rem, \
275  __tex_desc, __chan_desc); \
276  PBA_SET_TEX_SYMBOL(sym4, __h4); \
277  } \
278  } while (0)
279 
280 // Expand untyped tex1Dfetch(tex_symbol, idx) into typed form via token pasting
281 #define tex1Dfetch(tex, idx) tex1Dfetch_##tex(idx)
282 #define tex1Dfetch_tex_jacobian_cam(i) tex1Dfetch<float4>(tex_jacobian_cam, i)
283 #define tex1Dfetch_tex_jacobian_pts(i) tex1Dfetch<float4>(tex_jacobian_pts, i)
284 #define tex1Dfetch_tex_jacobian_idx(i) tex1Dfetch<int2>(tex_jacobian_idx, i)
285 #define tex1Dfetch_tex_jacobian_meas(i) tex1Dfetch<float2>(tex_jacobian_meas, i)
286 #define tex1Dfetch_tex_jacobian_sj(i) tex1Dfetch<float4>(tex_jacobian_sj, i)
287 #define tex1Dfetch_tex_jacobian_shuffle(i) tex1Dfetch<int>(tex_jacobian_shuffle, i)
288 #define tex1Dfetch_tex_compact_cam(i) tex1Dfetch<float4>(tex_compact_cam, i)
289 #define tex1Dfetch_tex_uncompressed_cam(i) tex1Dfetch<float4>(tex_uncompressed_cam, i)
290 #define tex1Dfetch_tex_update_cam(i) tex1Dfetch<float4>(tex_update_cam, i)
291 #define tex1Dfetch_tex_update_cam_delta(i) tex1Dfetch<float4>(tex_update_cam_delta, i)
292 #define tex1Dfetch_tex_projection_cam(i) tex1Dfetch<float4>(tex_projection_cam, i)
293 #define tex1Dfetch_tex_projection_idx(i) tex1Dfetch<int2>(tex_projection_idx, i)
294 #define tex1Dfetch_tex_projection_pts(i) tex1Dfetch<float4>(tex_projection_pts, i)
295 #define tex1Dfetch_tex_projection_mea(i) tex1Dfetch<float2>(tex_projection_mea, i)
296 #define tex1Dfetch_tex_jte_pe(i) tex1Dfetch<float2>(tex_jte_pe, i)
297 #define tex1Dfetch_tex_jte_pex(i) tex1Dfetch<float>(tex_jte_pex, i)
298 #define tex1Dfetch_tex_jte_jc(i) tex1Dfetch<float4>(tex_jte_jc, i)
299 #define tex1Dfetch_tex_jte_jc2(i) tex1Dfetch<float4>(tex_jte_jc2, i)
300 #define tex1Dfetch_tex_jte_cmp(i) tex1Dfetch<int>(tex_jte_cmp, i)
301 #define tex1Dfetch_tex_jte_cmt(i) tex1Dfetch<int>(tex_jte_cmt, i)
302 #define tex1Dfetch_tex_jte_jc3(i) tex1Dfetch<float4>(tex_jte_jc3, i)
303 #define tex1Dfetch_tex_jte_jc4(i) tex1Dfetch<float4>(tex_jte_jc4, i)
304 #define tex1Dfetch_tex_jte_jp(i) tex1Dfetch<float4>(tex_jte_jp, i)
305 #define tex1Dfetch_tex_jte_pmp(i) tex1Dfetch<int>(tex_jte_pmp, i)
306 #define tex1Dfetch_tex_jte_jp2(i) tex1Dfetch<float4>(tex_jte_jp2, i)
307 #define tex1Dfetch_tex_jtjd_cmp(i) tex1Dfetch<int>(tex_jtjd_cmp, i)
308 #define tex1Dfetch_tex_jtjd_cmlist(i) tex1Dfetch<int>(tex_jtjd_cmlist, i)
309 #define tex1Dfetch_tex_jtjd_jp(i) tex1Dfetch<float4>(tex_jtjd_jp, i)
310 #define tex1Dfetch_tex_jtjd_pmp(i) tex1Dfetch<int>(tex_jtjd_pmp, i)
311 #define tex1Dfetch_tex_jtjd_jp2(i) tex1Dfetch<float4>(tex_jtjd_jp2, i)
312 #define tex1Dfetch_tex_shuffle_jc(i) tex1Dfetch<float4>(tex_shuffle_jc, i)
313 #define tex1Dfetch_tex_shuffle_map(i) tex1Dfetch<int>(tex_shuffle_map, i)
314 #define tex1Dfetch_tex_shuffle_jc2(i) tex1Dfetch<float4>(tex_shuffle_jc2, i)
315 #define tex1Dfetch_tex_mjx_jc(i) tex1Dfetch<float4>(tex_mjx_jc, i)
316 #define tex1Dfetch_tex_mjx_jc2(i) tex1Dfetch<float4>(tex_mjx_jc2, i)
317 #define tex1Dfetch_tex_mjx_jc3(i) tex1Dfetch<float4>(tex_mjx_jc3, i)
318 #define tex1Dfetch_tex_mjx_jc4(i) tex1Dfetch<float4>(tex_mjx_jc4, i)
319 #define tex1Dfetch_tex_mjx_jp(i) tex1Dfetch<float4>(tex_mjx_jp, i)
320 #define tex1Dfetch_tex_mjx_jp2(i) tex1Dfetch<float4>(tex_mjx_jp2, i)
321 #define tex1Dfetch_tex_mjx_idx(i) tex1Dfetch<int2>(tex_mjx_idx, i)
322 #define tex1Dfetch_tex_mjx_x(i) tex1Dfetch<float4>(tex_mjx_x, i)
323 #define tex1Dfetch_tex_jte_q_idx(i) tex1Dfetch<int2>(tex_jte_q_idx, i)
324 #define tex1Dfetch_tex_jte_q_w(i) tex1Dfetch<float2>(tex_jte_q_w, i)
325 
326 void ProgramCU::FinishWorkCUDA() { cudaDeviceSynchronize(); }
327 
328 int ProgramCU::CheckErrorCUDA(const char* location) {
329  cudaError_t e = cudaGetLastError();
330  if (e) {
331  if (location) fprintf(stderr, "%s:\t", location);
332  fprintf(stderr, "%s(%d)\n", cudaGetErrorString(e), e);
333  throw location;
334  } else {
335  // fprintf(stderr, "%s:\n", location);
336  return 0;
337  }
338 }
339 
340 inline void ProgramCU::GetBlockConfiguration(unsigned int nblock,
341  unsigned int& bw,
342  unsigned int& bh) {
343  if (nblock <= MAX_BLOCKLEN) {
344  bw = nblock;
345  bh = 1;
346  } else {
347  bh = (nblock + MAX_BLOCKLEN_ALIGN - 1) / MAX_BLOCKLEN_ALIGN;
348  bw = (nblock + bh - 1) / bh;
349  bw = ((bw + 31) / 32) * 32;
350  bh = (nblock + bw - 1) / bw;
351  }
352 }
353 
354 void ProgramCU::ClearPreviousError() { cudaGetLastError(); }
355 
356 void ProgramCU::ResetCurrentDevice() {
357  int device = 0;
358  cudaGetDevice(&device);
359  cudaDeviceReset();
360  if (device > 0) cudaSetDevice(device);
361 }
362 
363 void ProgramCU::ClearTextureObjectCache() { PBA_ClearTextureObjectCache(); }
364 
365 size_t ProgramCU::GetCudaMemoryCap() {
366  int device;
367  if (cudaGetDevice(&device) != cudaSuccess) return 0;
368  cudaDeviceProp prop;
369  if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
370  if (prop.major == 9999 && prop.minor == 9999) return 0;
371  return prop.totalGlobalMem;
372  } else
373  return 0;
374 }
375 int ProgramCU::SetCudaDevice(int device) {
376  int count = 0, device_used;
377  if (cudaGetDeviceCount(&count) || count <= 0) {
378  ProgramCU::CheckErrorCUDA("CheckCudaDevice");
379  return 0;
380  } else if (count == 1) {
381  cudaDeviceProp deviceProp;
382  if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess) {
383  fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
384  return 0;
385  }
386  if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
387  fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
388  return 0;
389  }
390  }
391 
392  if (device > 0 && device < count) {
393  cudaSetDevice(device);
394  CheckErrorCUDA("cudaSetDevice\n");
395  }
396  cudaGetDevice(&device_used);
397  if (device != device_used)
398  fprintf(stderr,
399  "ERROR: Cannot set device to %d\n"
400  "WARNING: Use device-%d instead (out of %d)\n",
401  device, device_used, count);
402  return 1;
403 }
404 
405 #define WARP_REDUCTION_32(value) \
406  __syncthreads(); \
407  if (threadIdx.x < 16) value[threadIdx.x] += value[threadIdx.x + 16]; \
408  if (threadIdx.x < 8) value[threadIdx.x] += value[threadIdx.x + 8]; \
409  if (threadIdx.x < 4) value[threadIdx.x] += value[threadIdx.x + 4]; \
410  if (threadIdx.x < 2) value[threadIdx.x] += value[threadIdx.x + 2];
411 
412 #define WARP_REDUCTION_64(value) \
413  __syncthreads(); \
414  if (threadIdx.x < 32) value[threadIdx.x] += value[threadIdx.x + 32]; \
415  WARP_REDUCTION_32(value)
416 
417 #define WARP_REDUCTION_128(value) \
418  __syncthreads(); \
419  if (threadIdx.x < 64) value[threadIdx.x] += value[threadIdx.x + 64]; \
420  WARP_REDUCTION_64(value)
421 
422 #define WARP_REDUCTION_256(value) \
423  __syncthreads(); \
424  if (threadIdx.x < 128) value[threadIdx.x] += value[threadIdx.x + 128]; \
425  WARP_REDUCTION_128(value)
426 
427 __global__ void vector_max_kernel(const float* x, int len, int blen,
428  float* result) {
429  __shared__ float value[256];
430  int bstart = blen * blockIdx.x;
431  int start = bstart + threadIdx.x;
432  int end = min(len, bstart + blen);
433 
434  float v = 0;
435  for (int i = start; i < end; i += blockDim.x) v = max(v, fabs(x[i]));
436  value[threadIdx.x] = v;
437  // reduce to the first two values
438  __syncthreads();
439  if (threadIdx.x < 128)
440  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 128]);
441  __syncthreads();
442  if (threadIdx.x < 64)
443  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 64]);
444  __syncthreads();
445  if (threadIdx.x < 32)
446  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 32]);
447  if (threadIdx.x < 16)
448  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 16]);
449  if (threadIdx.x < 8)
450  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 8]);
451  if (threadIdx.x < 4)
452  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 4]);
453  if (threadIdx.x < 2)
454  value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 2]);
455  // write back
456  if (threadIdx.x == 0) result[blockIdx.x] = max(value[0], value[1]);
457 }
458 
459 float ProgramCU::ComputeVectorMax(CuTexImage& vector, CuTexImage& buf) {
460  const unsigned int nblock = 32;
461  const unsigned int bsize = 256;
462  int len = vector.GetLength();
463  int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
464 
465  ////////////////////////////////
466  dim3 grid(nblock), block(bsize);
467 
468  /////////////////////////////////
469  buf.InitTexture(nblock, 1);
470  vector_max_kernel<<<grid, block>>>(vector.data(), len, blen, buf.data());
471  ProgramCU::CheckErrorCUDA("ComputeVectorMax");
472 
473  float data[nblock], result = 0;
474  buf.CopyToHost(data);
475  for (unsigned int i = 0; i < nblock; ++i) result = max(result, data[i]);
476  return result;
477 }
478 
479 __global__ void vector_norm_kernel(const float* x, int len, int blen,
480  float* result) {
481  __shared__ float value[256];
482  int bstart = blen * blockIdx.x;
483  int start = bstart + threadIdx.x;
484  int end = min(len, bstart + blen);
485 
486  float v = 0;
487  for (int i = start; i < end; i += blockDim.x) {
488  float temp = x[i];
489  v += (temp * temp);
490  }
491  value[threadIdx.x] = v;
492  // reduce to the first two values
493  WARP_REDUCTION_256(value);
494 
495  // write back
496  if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
497 }
498 
499 double ProgramCU::ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf) {
500  const unsigned int nblock = REDUCTION_NBLOCK;
501  unsigned int bsize = 256;
502  int len = vector.GetLength();
503  int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
504 
505  ////////////////////////////////
506  dim3 grid(nblock), block(bsize);
507 
508  /////////////////////////////////
509  buf.InitTexture(nblock, 1);
510  vector_norm_kernel<<<grid, block>>>(vector.data(), len, blen, buf.data());
511  ProgramCU::CheckErrorCUDA("ComputeVectorNorm");
512 
513  float data[nblock];
514  buf.CopyToHost(data);
515  double result = 0;
516  for (unsigned int i = 0; i < nblock; ++i) result += data[i];
517  return result;
518 }
519 
520 __global__ void vector_sum_kernel(const float* x, int len, int blen,
521  float* result) {
522  __shared__ float value[256];
523  int bstart = blen * blockIdx.x;
524  int start = bstart + threadIdx.x;
525  int end = min(len, bstart + blen);
526  float v = 0;
527  for (int i = start; i < end; i += blockDim.x) v += x[i];
528 
529  value[threadIdx.x] = v;
530  // reduce to the first two values
531  WARP_REDUCTION_256(value);
532 
533  // write back
534  if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
535 }
536 
537 float ProgramCU::ComputeVectorSum(CuTexImage& vector, CuTexImage& buf,
538  int skip) {
539  const unsigned int nblock = REDUCTION_NBLOCK;
540  unsigned int bsize = 256;
541  int len = vector.GetLength() - skip;
542  int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
543 
544  ////////////////////////////////
545  dim3 grid(nblock), block(bsize);
546 
547  /////////////////////////////////
548  buf.InitTexture(nblock, 1);
549  vector_sum_kernel<<<grid, block>>>((vector.data()) + skip, len, blen,
550  buf.data());
551  ProgramCU::CheckErrorCUDA("ComputeVectorSum");
552 
553  float data[nblock];
554  buf.CopyToHost(data);
555  double result = 0;
556  for (unsigned int i = 0; i < nblock; ++i) result += data[i];
557  return (float)result;
558 }
559 
560 __global__ void vector_dotproduct_kernel(const float* a, const float* b,
561  int len, int blen, float* result) {
562  __shared__ float value[256];
563  int bstart = blen * blockIdx.x;
564  int start = bstart + threadIdx.x;
565  int end = min(len, bstart + blen);
566 
567  float v = 0;
568  for (int i = start; i < end; i += blockDim.x) v += (a[i] * b[i]);
569  value[threadIdx.x] = v;
570 
571  // reduce to the first two values
572  WARP_REDUCTION_256(value);
573 
574  // write back
575  if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
576 }
577 
578 double ProgramCU::ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2,
579  CuTexImage& buf) {
580  const unsigned int nblock = REDUCTION_NBLOCK;
581  unsigned int bsize = 256;
582  int len = vector1.GetLength();
583  int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
584 
585  ////////////////////////////////
586  dim3 grid(nblock), block(bsize);
587 
588  /////////////////////////////////
589  buf.InitTexture(nblock, 1);
590  vector_dotproduct_kernel<<<grid, block>>>(vector1.data(), vector2.data(), len,
591  blen, buf.data());
592  ProgramCU::CheckErrorCUDA("ComputeVectorDot");
593 
594  float data[nblock];
595  buf.CopyToHost(data);
596 
597  double result = 0;
598  for (unsigned int i = 0; i < nblock; ++i) result += data[i];
599  return result;
600 }
601 
602 __global__ void vector_weighted_norm_kernel(const float* vec, const float* w,
603  int len, int blen, float* result) {
604  __shared__ float value[256];
605  int bstart = blen * blockIdx.x;
606  int start = bstart + threadIdx.x;
607  int end = min(len, bstart + blen);
608 
609  float v = 0;
610  for (int i = start; i < end; i += blockDim.x) v += (vec[i] * w[i] * vec[i]);
611  value[threadIdx.x] = v;
612 
613  // reduce to the first two values
614  WARP_REDUCTION_256(value);
615 
616  // write back
617  if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
618 }
619 
620 double ProgramCU::ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight,
621  CuTexImage& buf) {
622  if (weight.IsValid()) {
623  const unsigned int nblock = REDUCTION_NBLOCK;
624  unsigned int bsize = 256;
625  int len = vector.GetLength();
626  int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
627 
628  ////////////////////////////////
629  dim3 grid(nblock), block(bsize);
630 
631  /////////////////////////////////
632  buf.InitTexture(nblock, 1);
633 
634  vector_weighted_norm_kernel<<<grid, block>>>(vector.data(), weight.data(),
635  len, blen, buf.data());
636 
637  ProgramCU::CheckErrorCUDA("ComputeVectorNormW");
638 
639  float data[nblock];
640  buf.CopyToHost(data);
641 
642  double result = 0;
643  for (unsigned int i = 0; i < nblock; ++i) result += data[i];
644  return result;
645  } else {
646  return ComputeVectorNorm(vector, buf);
647  }
648 }
649 // given vector x, y, and a weight a
650 // return a * x + y
651 __global__ void saxpy_kernel(const float a, const float* x, const float* y,
652  float* result, unsigned int len) {
653  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
654  if (idx < len) result[idx] = a * x[idx] + y[idx];
655 }
656 
657 __global__ void saxpy_kernel_large(const float a, const float* x,
658  const float* y, float* result,
659  unsigned int len, unsigned int rowsz) {
660  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
661  if (idx < len) result[idx] = a * x[idx] + y[idx];
662 }
663 
664 void ProgramCU::ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY,
665  CuTexImage& result) {
666  unsigned int len = result.GetLength();
667  unsigned int bsize = 128;
668  unsigned int nblock = (len + bsize - 1) / bsize;
669  if (nblock > MAX_BLOCKLEN) {
670  unsigned int bw, bh;
671  GetBlockConfiguration(nblock, bw, bh);
672  dim3 grid(bw, bh), block(bsize);
673  saxpy_kernel_large<<<grid, block>>>(a, texX.data(), texY.data(),
674  result.data(), len, bw * bsize);
675  } else {
676  dim3 grid(nblock), block(bsize);
677  saxpy_kernel<<<grid, block>>>(a, texX.data(), texY.data(), result.data(),
678  len);
679  }
680  ProgramCU::CheckErrorCUDA("ComputeSAXPY");
681 }
682 
683 __global__ void sxypz_kernel_large(float a, const float* x, const float* y,
684  const float* z, float* result,
685  unsigned int len, unsigned int rowsz) {
686  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
687  if (idx < len) result[idx] = a * x[idx] * y[idx] + z[idx];
688 }
689 
690 void ProgramCU::ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY,
691  CuTexImage& texZ, CuTexImage& result) {
692  if (texX.IsValid()) {
693  unsigned int len = texX.GetLength();
694  unsigned int bsize = 128;
695  unsigned int nblock = (len + bsize - 1) / bsize;
696  unsigned int bw, bh;
697  GetBlockConfiguration(nblock, bw, bh);
698  dim3 grid(bw, bh), block(bsize);
699  sxypz_kernel_large<<<grid, block>>>(a, texX.data(), texY.data(),
700  texZ.data(), result.data(), len,
701  bw * bsize);
702  } else {
703  ComputeSAXPY(a, texY, texZ, result);
704  }
705 }
706 
707 __global__ void vxy_kernel(const float* x, float* y, float* result,
708  unsigned int len) {
709  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
710  if (idx < len) result[idx] = x[idx] * y[idx];
711 }
712 
713 __global__ void vxy_kernel_large(const float* x, float* y, float* result,
714  unsigned int len, unsigned int rowsz) {
715  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + rowsz * blockIdx.y;
716  if (idx < len) result[idx] = x[idx] * y[idx];
717 }
718 
719 void ProgramCU::ComputeVXY(CuTexImage& texX, CuTexImage& texY,
720  CuTexImage& result, unsigned int part,
721  unsigned int skip) {
722  unsigned int len = part ? part : texX.GetLength();
723  unsigned int bsize = 128;
724  unsigned int nblock = (len + bsize - 1) / bsize;
725  if (nblock > MAX_BLOCKLEN) {
726  unsigned int bw, bh;
727  GetBlockConfiguration(nblock, bw, bh);
728  dim3 grid(bw, bh), block(bsize);
729  vxy_kernel_large<<<grid, block>>>(texX.data() + skip, texY.data() + skip,
730  result.data() + skip, len, bsize * bw);
731  } else {
732  dim3 grid(nblock), block(bsize);
733  vxy_kernel<<<grid, block>>>(texX.data() + skip, texY.data() + skip,
734  result.data() + skip, len);
735  }
736  ProgramCU::CheckErrorCUDA("ComputeVXY");
737 }
738 
739 __global__ void sqrt_kernel_large(float* x, unsigned int len,
740  unsigned int rowsz) {
741  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
742  if (idx < len) x[idx] = sqrt(x[idx]);
743 }
744 
745 void ProgramCU::ComputeSQRT(CuTexImage& tex) {
746  unsigned int len = tex.GetLength();
747  unsigned int bsize = 128;
748  unsigned int nblock = (len + bsize - 1) / bsize;
749  unsigned int bw, bh;
750  GetBlockConfiguration(nblock, bw, bh);
751  dim3 grid(bw, bh), block(bsize);
752  sqrt_kernel_large<<<grid, block>>>(tex.data(), len, bw * bsize);
753  ProgramCU::CheckErrorCUDA("ComputeSQRT");
754 }
755 
756 __global__ void rsqrt_kernel_large(float* x, unsigned int len,
757  unsigned int rowsz) {
758  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
759  if (idx < len) x[idx] = x[idx] > 0 ? rsqrt(x[idx]) : 0;
760 }
761 
762 void ProgramCU::ComputeRSQRT(CuTexImage& tex) {
763  unsigned int len = tex.GetLength();
764  unsigned int bsize = 128;
765  unsigned int nblock = (len + bsize - 1) / bsize;
766  unsigned int bw, bh;
767  GetBlockConfiguration(nblock, bw, bh);
768  dim3 grid(bw, bh), block(bsize);
769  rsqrt_kernel_large<<<grid, block>>>(tex.data(), len, bw * bsize);
770 
771  ProgramCU::CheckErrorCUDA("ComputeRSQRT");
772 }
773 
774 __global__ void sax_kernel(const float a, const float* x, float* result,
775  unsigned int len) {
776  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
777  if (idx < len) result[idx] = a * x[idx];
778 }
779 
780 __global__ void sax_kernel_large(const float a, const float* x, float* result,
781  unsigned int len, unsigned int rowsz) {
782  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
783  if (idx < len) result[idx] = a * x[idx];
784 }
785 
786 void ProgramCU::ComputeSAX(float a, CuTexImage& texX, CuTexImage& result) {
787  unsigned int len = texX.GetLength();
788  unsigned int bsize = 128;
789  unsigned int nblock = (len + bsize - 1) / bsize;
790 
791  if (nblock > MAX_BLOCKLEN) {
792  unsigned int bw, bh;
793  GetBlockConfiguration(nblock, bw, bh);
794  dim3 grid(bw, bh), block(bsize);
795  sax_kernel_large<<<grid, block>>>(a, texX.data(), result.data(), len,
796  bw * bsize);
797  } else {
798  dim3 grid(nblock), block(bsize);
799  sax_kernel<<<grid, block>>>(a, texX.data(), result.data(), len);
800  }
801  ProgramCU::CheckErrorCUDA("ComputeSAX");
802 }
803 
804 #define JACOBIAN_FRT_KWIDTH 64
805 
806 
807 
808 #ifndef PBA_DISABLE_CONST_CAMERA
809 #define JACOBIAN_SET_JC_BEGIN if (r3.w == 0.0f) {
810 #define JFRT_SET_JC_END \
811  } \
812  else { \
813  jc[jc_pos] = make_float4(0, 0, 0, 0); \
814  jc[jc_pos + 1] = make_float4(0, 0, 0, 0); \
815  jc[jc_pos + 2] = make_float4(0, 0, 0, 0); \
816  jc[jc_pos + 3] = make_float4(0, 0, 0, 0); \
817  }
818 #define JACOBIAN_SET_JC_END \
819  } \
820  else { \
821  jxc[0] = 0; \
822  jxc[1] = 0; \
823  jxc[2] = 0; \
824  jxc[3] = 0; \
825  jxc[4] = 0; \
826  jxc[5] = 0; \
827  jxc[6] = 0; \
828  jxc[7] = 0; \
829  jyc[0] = 0; \
830  jyc[1] = 0; \
831  jyc[2] = 0; \
832  jyc[3] = 0; \
833  jyc[4] = 0; \
834  jyc[5] = 0; \
835  jyc[6] = 0; \
836  jyc[7] = 0; \
837  }
838 #else
839 #define JACOBIAN_SET_JC_BEGIN
840 #define JFRT_SET_JC_END
841 #define JACOBIAN_SET_JC_END
842 #endif
843 
844 // projection model ei = K(RX + T) - (1 + r * m^2) * m
845 template <bool md, bool pd, bool scaling, bool shuffle>
846 __global__ void jacobian_frt_kernel(float4* jc, float4* jp, int nproj, int ptx,
847  int rowsz, float jic) {
848  ////////////////////////////////
849  int tidx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
850 
851  if (tidx >= nproj) return;
852  int2 proj = tex1Dfetch(tex_jacobian_idx, tidx);
853  int camera_pos = proj.x << 1;
854 
855  __shared__ float rr_data[JACOBIAN_FRT_KWIDTH * 9];
856  float* r = rr_data + IMUL(9, threadIdx.x);
857  float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
858  float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
859  r[0] = r1.x;
860  r[1] = r1.y;
861  r[2] = r1.z;
862  r[3] = r1.w;
863  float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
864  r[4] = r2.x;
865  r[5] = r2.y;
866  r[6] = r2.z;
867  r[7] = r2.w;
868  float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
869  r[8] = r3.x;
870 
871  float4 temp = tex1Dfetch(tex_jacobian_pts, proj.y);
872  float m[3];
873  m[0] = temp.x;
874  m[1] = temp.y;
875  m[2] = temp.z;
876 
877  float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
878  float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
879  float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
880  float f_p2 = FDIV(ft.x, z0 + ft.w);
881  float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
882  float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
883 
884  // dp/dx = [f/p2 0 -f*p0/p2/p2]
885  // [0 f/p2 -f*p1/p2/p2]
886  // dx/dw = [ 0 z -y]
887  // [-z 0 x]
888  // [ y -x 0]
889  // R(dw) (x y z)' = (0 -z y)' dw0 + (z 0 -x)'dw1 + (-y x 0)'dw2
890  int jc_pos;
891  if (shuffle) {
892  jc_pos = tex1Dfetch(tex_jacobian_shuffle, tidx) << 2;
893  } else {
894  jc_pos = tidx << 2;
895  }
896 
897  if (pd) {
898  float rr1 = r3.y * p0_p2 * p0_p2;
899  float rr2 = r3.y * p1_p2 * p1_p2;
900  float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
901  float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
902  if (scaling == false) {
903  if (jc) {
904  JACOBIAN_SET_JC_BEGIN
905  // float jic = (r3.w != 1.0f && r3.w != 2.0f) ? 1.0f : 0.0f;
906  // float jec = (r3.w != 1.0f && r3.w != 3.0f) ? 1.0f : 0.0f;
907  float jfc = jic * (1 + rr1 + rr2);
908  float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
909  jc[jc_pos] = make_float4(p0_p2 * jfc, f_p2_x, 0, -f_p2_x * p0_p2);
910  jc[jc_pos + 1] =
911  make_float4(-f_p2_x * p0_p2 * y0, f_p2_x * (z0 + x0 * p0_p2),
912  -f_p2_x * y0, ft_x_pn * p0_p2);
913  jc[jc_pos + 2] = make_float4(p1_p2 * jfc, 0, f_p2_y, -f_p2 * p1_p2);
914  jc[jc_pos + 3] =
915  make_float4(-f_p2_y * (z0 + y0 * p1_p2), f_p2_y * x0 * p1_p2,
916  f_p2_y * x0, ft_x_pn * p1_p2);
917  JFRT_SET_JC_END
918  }
919  ////////////////////
920  jp[(tidx << 1)] = make_float4(f_p2_x * (r[0] - r[6] * p0_p2),
921  f_p2_x * (r[1] - r[7] * p0_p2),
922  f_p2_x * (r[2] - r[8] * p0_p2), 0);
923  jp[(tidx << 1) + 1] = make_float4(f_p2_y * (r[3] - r[6] * p1_p2),
924  f_p2_y * (r[4] - r[7] * p1_p2),
925  f_p2_y * (r[5] - r[8] * p1_p2), 0);
926  } else {
927  ////////////////////
928  if (jc) {
929  JACOBIAN_SET_JC_BEGIN
930  float jfc = jic * (1 + rr1 + rr2);
931  float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
932  float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
933  jc[jc_pos] = make_float4(p0_p2 * jfc * sc1.x, f_p2_x * sc1.y, 0,
934  -f_p2_x * p0_p2 * sc1.w);
935  jc[jc_pos + 2] = make_float4(p1_p2 * jfc * sc1.x, 0, f_p2_y * sc1.z,
936  -f_p2_y * p1_p2 * sc1.w);
937 
938  float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
939  jc[jc_pos + 1] = make_float4(
940  -sc2.x * f_p2_x * p0_p2 * y0, sc2.y * f_p2_x * (z0 + x0 * p0_p2),
941  -sc2.z * f_p2_x * y0, ft_x_pn * p0_p2 * sc2.w);
942  jc[jc_pos + 3] = make_float4(
943  -sc2.x * f_p2_y * (z0 + y0 * p1_p2), sc2.y * f_p2_y * x0 * p1_p2,
944  sc2.z * f_p2_y * x0, ft_x_pn * p1_p2 * sc2.w);
945  JFRT_SET_JC_END
946  }
947 
948  float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
949  jp[(tidx << 1)] = make_float4(sc3.x * f_p2_x * (r[0] - r[6] * p0_p2),
950  sc3.y * f_p2_x * (r[1] - r[7] * p0_p2),
951  sc3.z * f_p2_x * (r[2] - r[8] * p0_p2), 0);
952  jp[(tidx << 1) + 1] =
953  make_float4(sc3.x * f_p2_y * (r[3] - r[6] * p1_p2),
954  sc3.y * f_p2_y * (r[4] - r[7] * p1_p2),
955  sc3.z * f_p2_y * (r[5] - r[8] * p1_p2), 0);
956  }
957  } else if (md) {
958  if (scaling == false) {
959  if (jc) {
960  JACOBIAN_SET_JC_BEGIN
961  float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
962  float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
963  jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2);
964  jc[jc_pos + 1] =
965  make_float4(-f_p2 * p0_p2 * y0, f_p2 * (z0 + x0 * p0_p2),
966  -f_p2 * y0, -ms.x * msn);
967  jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2);
968  jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2),
969  f_p2 * x0 * p1_p2, f_p2 * x0, -ms.y * msn);
970  JFRT_SET_JC_END
971  }
972  ////////////////////
973  jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2),
974  f_p2 * (r[1] - r[7] * p0_p2),
975  f_p2 * (r[2] - r[8] * p0_p2), 0);
976  jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2),
977  f_p2 * (r[4] - r[7] * p1_p2),
978  f_p2 * (r[5] - r[8] * p1_p2), 0);
979  } else {
980  if (jc) {
981  JACOBIAN_SET_JC_BEGIN
982  float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
983  jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0,
984  -f_p2 * p0_p2 * sc1.w);
985  jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z,
986  -f_p2 * p1_p2 * sc1.w);
987 
988  float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
989  float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
990  float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
991  jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0,
992  sc2.y * f_p2 * (z0 + x0 * p0_p2),
993  -sc2.z * f_p2 * y0, -msn * ms.x * sc2.w);
994  jc[jc_pos + 3] = make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2),
995  sc2.y * f_p2 * x0 * p1_p2,
996  sc2.z * f_p2 * x0, -msn * ms.y * sc2.w);
997  JFRT_SET_JC_END
998  }
999  float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
1000  jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2),
1001  sc3.y * f_p2 * (r[1] - r[7] * p0_p2),
1002  sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0);
1003  jp[(tidx << 1) + 1] =
1004  make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2),
1005  sc3.y * f_p2 * (r[4] - r[7] * p1_p2),
1006  sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0);
1007  }
1008 
1009  } else {
1010  if (scaling == false) {
1011  if (jc) {
1012  JACOBIAN_SET_JC_BEGIN
1013  jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2);
1014  jc[jc_pos + 1] = make_float4(-f_p2 * p0_p2 * y0,
1015  f_p2 * (z0 + x0 * p0_p2), -f_p2 * y0, 0);
1016  jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2);
1017  jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2),
1018  f_p2 * x0 * p1_p2, f_p2 * x0, 0);
1019  JFRT_SET_JC_END
1020  }
1021  ////////////////////
1022  jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2),
1023  f_p2 * (r[1] - r[7] * p0_p2),
1024  f_p2 * (r[2] - r[8] * p0_p2), 0);
1025  jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2),
1026  f_p2 * (r[4] - r[7] * p1_p2),
1027  f_p2 * (r[5] - r[8] * p1_p2), 0);
1028  } else {
1029  if (jc) {
1030  JACOBIAN_SET_JC_BEGIN
1031  float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
1032  jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0,
1033  -f_p2 * p0_p2 * sc1.w);
1034  jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z,
1035  -f_p2 * p1_p2 * sc1.w);
1036  float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
1037  jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0,
1038  sc2.y * f_p2 * (z0 + x0 * p0_p2),
1039  -sc2.z * f_p2 * y0, 0);
1040  jc[jc_pos + 3] =
1041  make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2),
1042  sc2.y * f_p2 * x0 * p1_p2, sc2.z * f_p2 * x0, 0);
1043  JFRT_SET_JC_END
1044  }
1045 
1046  float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
1047  jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2),
1048  sc3.y * f_p2 * (r[1] - r[7] * p0_p2),
1049  sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0);
1050  jp[(tidx << 1) + 1] =
1051  make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2),
1052  sc3.y * f_p2 * (r[4] - r[7] * p1_p2),
1053  sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0);
1054  }
1055  }
1056 }
1057 
1058 /////////////////////////////////
1059 void ProgramCU::ComputeJacobian(CuTexImage& camera, CuTexImage& point,
1060  CuTexImage& jc, CuTexImage& jp,
1061  CuTexImage& proj_map, CuTexImage& sj,
1062  CuTexImage& meas, CuTexImage& cmlist,
1063  bool intrinsic_fixed, int radial_distortion,
1064  bool shuffle) {
1065  float jfc = intrinsic_fixed ? 0.0f : 1.0f;
1066  unsigned int len = proj_map.GetImgWidth();
1067  unsigned int bsize = JACOBIAN_FRT_KWIDTH;
1068  unsigned int nblock = (len + bsize - 1) / bsize;
1069  unsigned int bw, bh;
1070  GetBlockConfiguration(nblock, bw, bh);
1071  dim3 grid(bw, bh), block(bsize);
1072 
1073  PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1074  PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1075  PBA_BIND_TEX1D(tex_jacobian_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1076 
1077  if (!jc.IsValid()) shuffle = false;
1078  if (shuffle) PBA_BIND_TEX1D(tex_jacobian_shuffle, cmlist, cudaReadModeElementType, PBA_ChanInt());
1079  if (sj.IsValid()) PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
1080 
1081  if (radial_distortion == -1) {
1082  PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
1083  if (sj.IsValid()) {
1084  if (shuffle)
1085  jacobian_frt_kernel<true, false, true, true><<<grid, block>>>(
1086  (float4*)jc.data(), (float4*)jp.data(), len,
1087  camera.GetImgWidth() * 2, bw * bsize, jfc);
1088  else
1089  jacobian_frt_kernel<true, false, true, false><<<grid, block>>>(
1090  (float4*)jc.data(), (float4*)jp.data(), len,
1091  camera.GetImgWidth() * 2, bw * bsize, jfc);
1092  } else {
1093  if (shuffle)
1094  jacobian_frt_kernel<true, false, false, true><<<grid, block>>>(
1095  (float4*)jc.data(), (float4*)jp.data(), len,
1096  camera.GetImgWidth() * 2, bw * bsize, jfc);
1097  else
1098  jacobian_frt_kernel<true, false, false, false><<<grid, block>>>(
1099  (float4*)jc.data(), (float4*)jp.data(), len,
1100  camera.GetImgWidth() * 2, bw * bsize, jfc);
1101  }
1102  } else if (radial_distortion) {
1103  if (sj.IsValid()) {
1104  if (shuffle)
1105  jacobian_frt_kernel<false, true, true, true><<<grid, block>>>(
1106  (float4*)jc.data(), (float4*)jp.data(), len,
1107  camera.GetImgWidth() * 2, bw * bsize, jfc);
1108  else
1109  jacobian_frt_kernel<false, true, true, false><<<grid, block>>>(
1110  (float4*)jc.data(), (float4*)jp.data(), len,
1111  camera.GetImgWidth() * 2, bw * bsize, jfc);
1112  } else {
1113  if (shuffle)
1114  jacobian_frt_kernel<false, true, false, true><<<grid, block>>>(
1115  (float4*)jc.data(), (float4*)jp.data(), len,
1116  camera.GetImgWidth() * 2, bw * bsize, jfc);
1117  else
1118  jacobian_frt_kernel<false, true, false, false><<<grid, block>>>(
1119  (float4*)jc.data(), (float4*)jp.data(), len,
1120  camera.GetImgWidth() * 2, bw * bsize, jfc);
1121  }
1122  } else {
1123  if (sj.IsValid()) {
1124  if (shuffle)
1125  jacobian_frt_kernel<false, false, true, true><<<grid, block>>>(
1126  (float4*)jc.data(), (float4*)jp.data(), len,
1127  camera.GetImgWidth() * 2, bw * bsize, jfc);
1128  else
1129  jacobian_frt_kernel<false, false, true, false><<<grid, block>>>(
1130  (float4*)jc.data(), (float4*)jp.data(), len,
1131  camera.GetImgWidth() * 2, bw * bsize, jfc);
1132  } else {
1133  if (shuffle)
1134  jacobian_frt_kernel<false, false, false, true><<<grid, block>>>(
1135  (float4*)jc.data(), (float4*)jp.data(), len,
1136  camera.GetImgWidth() * 2, bw * bsize, jfc);
1137  else
1138  jacobian_frt_kernel<false, false, false, false><<<grid, block>>>(
1139  (float4*)jc.data(), (float4*)jp.data(), len,
1140  camera.GetImgWidth() * 2, bw * bsize, jfc);
1141  }
1142  }
1143 
1144  ProgramCU::CheckErrorCUDA("ComputeJacobian");
1145 }
1146 
1147 
1148 __global__ void uncompress_frt_kernel(int ncam, float4* ucam) {
1149  int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1150  if (tidx >= ncam) return;
1151  int fetch_index = tidx << 1;
1152  int write_index = IMUL(tidx, 4);
1153  float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index);
1154  ucam[write_index] = temp1;
1155 
1156  float4 temp2 = tex1Dfetch(tex_compact_cam, fetch_index + 1);
1157  float rx = temp2.x;
1158  float ry = temp2.y;
1159  float rz = temp2.z;
1160  float rx_rx = rx * rx;
1161  float ry_ry = ry * ry;
1162  float rz_rz = rz * rz;
1163  float aa = sqrt(rx_rx + ry_ry + rz_rz);
1164  float caa, saa;
1165  sincosf(aa, &saa, &caa);
1166  float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa);
1167  float st = aa == 0.0 ? 1 : FDIV2(saa, aa);
1168  float rz_st = rz * st;
1169  float rx_st = rx * st;
1170  float ry_st = ry * st;
1171  float ry_ry_ct = ry_ry * ct;
1172  float rx_rx_ct = rx_rx * ct;
1173  float rz_rz_ct = rz_rz * ct;
1174  float rx_ry_ct = rx * ry * ct;
1175  float rz_rx_ct = rz * rx * ct;
1176  float ry_rz_ct = ry * rz * ct;
1177 
1178  ////////////////////////////////////////////////////////////
1179  ucam[write_index + 1] =
1180  make_float4((1.0 - (ry_ry_ct + rz_rz_ct)), (rx_ry_ct - rz_st),
1181  (rz_rx_ct + ry_st), (rx_ry_ct + rz_st));
1182 
1183  ucam[write_index + 2] =
1184  make_float4((1.0 - (rz_rz_ct + rx_rx_ct)), (ry_rz_ct - rx_st),
1185  (rz_rx_ct - ry_st), (ry_rz_ct + rx_st));
1186 
1187  ucam[write_index + 3] =
1188  make_float4((1.0 - (rx_rx_ct + ry_ry_ct)), temp2.w, 0, 0);
1189 }
1190 
1191 void ProgramCU::UncompressCamera(int ncam, CuTexImage& camera,
1192  CuTexImage& result) {
1193  unsigned int len = ncam;
1194  unsigned int bsize = 64;
1195  unsigned int nblock = (len + bsize - 1) / bsize;
1196  dim3 grid(nblock);
1197  dim3 block(bsize);
1198  PBA_BIND_TEX1D(tex_compact_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1199  uncompress_frt_kernel<<<grid, block>>>(len, (float4*)result.data());
1200  CheckErrorCUDA("UncompressCamera");
1201 }
1202 
1203 
1204 
1205 __global__ void compress_frt_kernel(int ncam, float4* zcam) {
1206  int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1207  if (tidx >= ncam) return;
1208  int fetch_index = tidx << 2;
1209  int write_index = tidx << 1;
1210  float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index);
1211  zcam[write_index] = temp1;
1212 
1213  float4 r1 = tex1Dfetch(tex_compact_cam, fetch_index + 1);
1214  float4 r2 = tex1Dfetch(tex_compact_cam, fetch_index + 2);
1215  float4 r3 = tex1Dfetch(tex_compact_cam, fetch_index + 3);
1216 
1217  float a = (r1.x + r2.x + r3.x - 1.0) / 2.0;
1218  if (a >= 1.0) {
1219  zcam[write_index + 1] = make_float4(0, 0, 0, 0);
1220  } else {
1221  float aa = acos(a), b = 0.5 * aa * rsqrt(1 - a * a);
1222  zcam[write_index + 1] = make_float4(b * (r2.w - r2.y), b * (r1.z - r2.z),
1223  b * (r1.w - r1.y), r3.y);
1224  }
1225 }
1226 
1227 void ProgramCU::CompressCamera(int ncam, CuTexImage& camera0,
1228  CuTexImage& result) {
1229  unsigned int len = ncam;
1230  unsigned int bsize = 64;
1231  unsigned int nblock = (len + bsize - 1) / bsize;
1232  dim3 grid(nblock), block(bsize);
1233  PBA_BIND_TEX1D(tex_uncompressed_cam, camera0, cudaReadModeElementType, PBA_ChanFloat4());
1234  compress_frt_kernel<<<grid, block>>>(ncam, (float4*)result.data());
1235  CheckErrorCUDA("CompressCamera");
1236 }
1237 
1238 __device__ inline void uncompress_rodrigues_rotation(float rx, float ry,
1239  float rz, float* r) {
1240  float rx_rx = rx * rx;
1241  float ry_ry = ry * ry;
1242  float rz_rz = rz * rz;
1243  float aa = sqrt(rx_rx + ry_ry + rz_rz);
1244  float caa, saa;
1245  sincosf(aa, &saa, &caa);
1246  float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa);
1247  float st = aa == 0.0 ? 1 : FDIV2(saa, aa);
1248  float rz_st = rz * st;
1249  float rx_st = rx * st;
1250  float ry_st = ry * st;
1251  float ry_ry_ct = ry_ry * ct;
1252  float rx_rx_ct = rx_rx * ct;
1253  float rz_rz_ct = rz_rz * ct;
1254  float rx_ry_ct = rx * ry * ct;
1255  float rz_rx_ct = rz * rx * ct;
1256  float ry_rz_ct = ry * rz * ct;
1257  r[0] = (1.0 - (ry_ry_ct + rz_rz_ct));
1258  r[1] = (rx_ry_ct - rz_st);
1259  r[2] = (rz_rx_ct + ry_st);
1260  r[3] = (rx_ry_ct + rz_st);
1261  r[4] = (1.0 - (rz_rz_ct + rx_rx_ct));
1262  r[5] = (ry_rz_ct - rx_st);
1263  r[6] = (rz_rx_ct - ry_st);
1264  r[7] = (ry_rz_ct + rx_st);
1265  r[8] = (1.0 - (rx_rx_ct + ry_ry_ct));
1266 }
1267 
1268 
1269 
1270 __global__ void update_camera_kernel(int ncam, float4* newcam) {
1271  int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1272  if (tidx >= ncam) return;
1273  int index0 = tidx << 2;
1274  int index1 = tidx << 1;
1275  {
1276  float4 c1 = tex1Dfetch(tex_update_cam, index0);
1277  float4 d1 = tex1Dfetch(tex_update_cam_delta, index1);
1278  float4 c2 = make_float4(max(c1.x + d1.x, 1e-10f), c1.y + d1.y, c1.z + d1.z,
1279  c1.w + d1.w);
1280  newcam[index0] = c2;
1281  }
1282  {
1283  float r[9], dr[9]; //, nr[9];
1284  float4 r1 = tex1Dfetch(tex_update_cam, index0 + 1);
1285  r[0] = r1.x;
1286  r[1] = r1.y;
1287  r[2] = r1.z;
1288  r[3] = r1.w;
1289  float4 r2 = tex1Dfetch(tex_update_cam, index0 + 2);
1290  r[4] = r2.x;
1291  r[5] = r2.y;
1292  r[6] = r2.z;
1293  r[7] = r2.w;
1294  float4 r3 = tex1Dfetch(tex_update_cam, index0 + 3);
1295  r[8] = r3.x;
1296 
1297  float4 dd = tex1Dfetch(tex_update_cam_delta, index1 + 1);
1298  uncompress_rodrigues_rotation(dd.x, dd.y, dd.z, dr);
1299 
1300  ///////////////////////////////////////////////
1301  newcam[index0 + 1] =
1302  make_float4(dr[0] * r[0] + dr[1] * r[3] + dr[2] * r[6],
1303  dr[0] * r[1] + dr[1] * r[4] + dr[2] * r[7],
1304  dr[0] * r[2] + dr[1] * r[5] + dr[2] * r[8],
1305  dr[3] * r[0] + dr[4] * r[3] + dr[5] * r[6]);
1306  newcam[index0 + 2] =
1307  make_float4(dr[3] * r[1] + dr[4] * r[4] + dr[5] * r[7],
1308  dr[3] * r[2] + dr[4] * r[5] + dr[5] * r[8],
1309  dr[6] * r[0] + dr[7] * r[3] + dr[8] * r[6],
1310  dr[6] * r[1] + dr[7] * r[4] + dr[8] * r[7]);
1311  newcam[index0 + 3] = make_float4(dr[6] * r[2] + dr[7] * r[5] + dr[8] * r[8],
1312  r3.y + dd.w, r3.z, r3.w);
1313  }
1314 }
1315 
1316 void ProgramCU::UpdateCameraPoint(int ncam, CuTexImage& camera,
1317  CuTexImage& point, CuTexImage& delta,
1318  CuTexImage& new_camera, CuTexImage& new_point,
1319  int mode) {
1320  if (mode != 2) {
1321  unsigned int len = ncam;
1322  unsigned int bsize = 64;
1323  unsigned int nblock = (len + bsize - 1) / bsize;
1324  dim3 grid(nblock), block(bsize);
1325  PBA_BIND_TEX1D(tex_update_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1326  PBA_BIND_TEX1D(tex_update_cam_delta, delta, cudaReadModeElementType, PBA_ChanFloat4());
1327  update_camera_kernel<<<grid, block>>>(len, (float4*)new_camera.data());
1328  CheckErrorCUDA("UpdateCamera");
1329  }
1330 
1331  // update the points
1332  if (mode != 1) {
1333  CuTexImage dp;
1334  dp.SetTexture(delta.data() + 8 * ncam, point.GetLength());
1335  ComputeSAXPY(1.0f, dp, point, new_point);
1336  CheckErrorCUDA("UpdatePoint");
1337  }
1338 }
1339 
1340 #define PROJECTION_FRT_KWIDTH 64
1341 
1342 
1343 
1344 // run 32/64/128 projections in a block
1345 template <bool md, bool pd>
1346 __global__ void projection_frt_kernel(int nproj, int rowsz, float2* pj) {
1347  ////////////////////////////////
1348  int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
1349  if (tidx >= nproj) return;
1350  float f, m[3], t[3]; // r[9],
1351  __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9];
1352  float* r = rr_data + IMUL(9, threadIdx.x);
1353  int2 proj = tex1Dfetch(tex_projection_idx, tidx);
1354  int cpos = proj.x << 1;
1355  float4 ft = tex1Dfetch(tex_projection_cam, cpos);
1356  f = ft.x;
1357  t[0] = ft.y;
1358  t[1] = ft.z;
1359  t[2] = ft.w;
1360  float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1);
1361  r[0] = r1.x;
1362  r[1] = r1.y;
1363  r[2] = r1.z;
1364  r[3] = r1.w;
1365  float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2);
1366  r[4] = r2.x;
1367  r[5] = r2.y;
1368  r[6] = r2.z;
1369  r[7] = r2.w;
1370  float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3);
1371  r[8] = r3.x;
1372 
1373  float4 temp = tex1Dfetch(tex_projection_pts, proj.y);
1374  m[0] = temp.x;
1375  m[1] = temp.y;
1376  m[2] = temp.z;
1377 
1378  float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0];
1379  float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1];
1380  float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2];
1381 
1382  if (pd) {
1383  float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2);
1384  float f_p2 = FDIV2(f * rr, p2);
1385  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1386  pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1387  } else if (md) {
1388  float f_p2 = FDIV2(f, p2);
1389  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1390  float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y);
1391  pj[tidx] = make_float2(ms.x * rd - p0 * f_p2, ms.y * rd - p1 * f_p2);
1392  } else {
1393  float f_p2 = FDIV2(f, p2);
1394  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1395  pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1396  }
1397 }
1398 
1399 void ProgramCU::ComputeProjection(CuTexImage& camera, CuTexImage& point,
1400  CuTexImage& meas, CuTexImage& proj_map,
1401  CuTexImage& proj, int radial) {
1402  unsigned int len = proj_map.GetImgWidth();
1403  unsigned int bsize = PROJECTION_FRT_KWIDTH;
1404  unsigned int nblock = (len + bsize - 1) / bsize;
1405  PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1406  PBA_BIND_TEX1D(tex_projection_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1407  PBA_BIND_TEX1D(tex_projection_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1408  unsigned int bw, bh;
1409  GetBlockConfiguration(nblock, bw, bh);
1410  dim3 grid(bw, bh), block(bsize);
1411  PBA_BIND_TEX1D(tex_projection_mea, meas, cudaReadModeElementType, PBA_ChanFloat2());
1412  if (radial == -1)
1413  projection_frt_kernel<true, false><<<grid, block>>>(len, bw * bsize,
1414  (float2*)proj.data());
1415  else if (radial)
1416  projection_frt_kernel<false, true><<<grid, block>>>(len, bw * bsize,
1417  (float2*)proj.data());
1418  else
1419  projection_frt_kernel<false, false><<<grid, block>>>(len, bw * bsize,
1420  (float2*)proj.data());
1421  CheckErrorCUDA("ComputeProjection");
1422 }
1423 
1424 template <bool md, bool pd>
1425 __global__ void projectionx_frt_kernel(int nproj, int rowsz, float2* pj) {
1426  ////////////////////////////////
1427  int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
1428  if (tidx >= nproj) return;
1429  float f, m[3], t[3]; // r[9],
1430  __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9];
1431  float* r = rr_data + IMUL(9, threadIdx.x);
1432  int2 proj = tex1Dfetch(tex_projection_idx, tidx);
1433  int cpos = proj.x << 1;
1434  float4 ft = tex1Dfetch(tex_projection_cam, cpos);
1435  f = ft.x;
1436  t[0] = ft.y;
1437  t[1] = ft.z;
1438  t[2] = ft.w;
1439  float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1);
1440  r[0] = r1.x;
1441  r[1] = r1.y;
1442  r[2] = r1.z;
1443  r[3] = r1.w;
1444  float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2);
1445  r[4] = r2.x;
1446  r[5] = r2.y;
1447  r[6] = r2.z;
1448  r[7] = r2.w;
1449  float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3);
1450  r[8] = r3.x;
1451 
1452  float4 temp = tex1Dfetch(tex_projection_pts, proj.y);
1453  m[0] = temp.x;
1454  m[1] = temp.y;
1455  m[2] = temp.z;
1456 
1457  float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0];
1458  float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1];
1459  float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2];
1460  if (pd) {
1461  float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2);
1462  float f_p2 = FDIV2(f, p2);
1463  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1464  pj[tidx] = make_float2(ms.x / rr - p0 * f_p2, ms.y / rr - p1 * f_p2);
1465  } else if (md) {
1466  float f_p2 = FDIV2(f, p2);
1467  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1468  float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y);
1469  pj[tidx] = make_float2(ms.x - p0 * f_p2 / rd, ms.y - p1 * f_p2 / rd);
1470  } else {
1471  float f_p2 = FDIV2(f, p2);
1472  float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1473  pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1474  }
1475 }
1476 
1477 void ProgramCU::ComputeProjectionX(CuTexImage& camera, CuTexImage& point,
1478  CuTexImage& meas, CuTexImage& proj_map,
1479  CuTexImage& proj, int radial) {
1480  unsigned int len = proj_map.GetImgWidth();
1481  unsigned int bsize = PROJECTION_FRT_KWIDTH;
1482  unsigned int nblock = (len + bsize - 1) / bsize;
1483  PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1484  PBA_BIND_TEX1D(tex_projection_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1485  PBA_BIND_TEX1D(tex_projection_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1486  unsigned int bw, bh;
1487  GetBlockConfiguration(nblock, bw, bh);
1488  dim3 grid(bw, bh), block(bsize);
1489  PBA_BIND_TEX1D(tex_projection_mea, meas, cudaReadModeElementType, PBA_ChanFloat2());
1490  if (radial == -1)
1491  projectionx_frt_kernel<true, false><<<grid, block>>>(len, bw * bsize,
1492  (float2*)proj.data());
1493  else if (radial)
1494  projectionx_frt_kernel<false, true><<<grid, block>>>(len, bw * bsize,
1495  (float2*)proj.data());
1496  else
1497  projectionx_frt_kernel<false, false><<<grid, block>>>(len, bw * bsize,
1498  (float2*)proj.data());
1499  CheckErrorCUDA("ComputeProjection");
1500 }
1501 
1502 
1503 
1504 __global__ void jte_cam_kernel(int num, float* jc, float* jte) {
1505  __shared__ float value[128];
1506 
1507  // 8thread per camera
1508  int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1509  if (col >= num) return;
1510 
1511  int cam = col >> 4; // 8 thread per camera
1512 
1513  // read data range for this camera, 8 thread will do the same thing
1514  int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera
1515  int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1
1516 
1517  ///////////////////////////////
1518  int offset = threadIdx.x & 0xf; // which parameter of this camera
1519  int part = offset >= 8 ? 1 : 0;
1520  /////////////////////////////
1521 
1522  float result = 0;
1523  // loop to read the index of the projection.
1524  // so to get the location to read the jacobian
1525  for (int i = idx1 + offset; i < idx2; i += 16) {
1526  float temp = jc[i];
1527  // every 8 thread will read the same position.
1528  int index = tex1Dfetch(tex_jte_cmt, i >> 4);
1529  float v = tex1Dfetch(tex_jte_pex, (index << 1) + part);
1530  //////////////////////
1531  result += temp * v;
1532  }
1533  value[threadIdx.x] = result;
1534  // write back
1535  if (offset < 8) jte[(cam << 3) + offset] = (result + value[threadIdx.x + 8]);
1536 }
1537 
1538 template <int KH, int TEXN>
1539 __global__ void jte_cam_vec_kernel(int num, float* jte) {
1540  __shared__ float value[KH * 128];
1541  int cam = blockIdx.x * KH + threadIdx.y;
1542  if (cam >= num) return;
1543 
1544  // read data range for this camera
1545  // 8 thread will do the same thing
1546  int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 2; // first camera
1547  int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 2; // last camera + 1
1548  int part = (threadIdx.x & 0x02) ? 1 : 0;
1549 
1550  float rx = 0, ry = 0, rz = 0, rw = 0;
1551  // loop to read the index of the projection.
1552  // so to get the location to read the jacobian
1553  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1554  float4 temp;
1555  if (TEXN == 1) {
1556  temp = tex1Dfetch(tex_jte_jc, i);
1557  }
1558  if (TEXN == 2) {
1559  int texid = i >> 25;
1560  if (texid == 0)
1561  temp = tex1Dfetch(tex_jte_jc, i);
1562  else
1563  temp = tex1Dfetch(tex_jte_jc2, (i & 0x1ffffff));
1564  }
1565  if (TEXN == 4) {
1566  int index = tex1Dfetch(tex_jte_cmt, i >> 2);
1567  int iii = (index << 2) + (i & 0x3);
1568  int texid = iii >> 25;
1569  /////////////////////////////////
1570  if (texid == 0)
1571  temp = tex1Dfetch(tex_jte_jc, iii);
1572  else if (texid == 1)
1573  temp = tex1Dfetch(tex_jte_jc2, (iii & 0x1ffffff));
1574  else if (texid == 2)
1575  temp = tex1Dfetch(tex_jte_jc3, (iii & 0x1ffffff));
1576  else
1577  temp = tex1Dfetch(tex_jte_jc4, (iii & 0x1ffffff));
1578  }
1579  int index = tex1Dfetch(tex_jte_cmt, i >> 2);
1580  float vv = tex1Dfetch(tex_jte_pex, (index << 1) + part);
1581  rx += temp.x * vv;
1582  ry += temp.y * vv;
1583  rz += temp.z * vv;
1584  rw += temp.w * vv;
1585  }
1586  ////////////////////////////////////
1587  int widx = (threadIdx.y << 7) + (threadIdx.x << 2);
1588  ///////////////////////////////////
1589  // write back
1590  value[widx] = rx;
1591  value[widx + 1] = ry;
1592  value[widx + 2] = rz;
1593  value[widx + 3] = rw;
1594  ////////////////////////////////////
1595  int ridx = (threadIdx.y << 7) + threadIdx.x;
1596  value[ridx] = ((value[ridx] + value[ridx + 32]) +
1597  (value[ridx + 64] + value[ridx + 96]));
1598  if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
1599  if (threadIdx.x < 8)
1600  jte[(cam << 3) + threadIdx.x] = value[ridx] + value[ridx + 8];
1601 }
1602 
1603 template <int KH, bool JT>
1604 __global__ void jte_cam_vec32_kernel(int num, float* jc, float* jte) {
1605  __shared__ float value[KH * 32];
1606  int cam = blockIdx.x * KH + threadIdx.y;
1607  if (cam >= num) return;
1608  float sum = 0;
1609  int rowpos = (threadIdx.y << 5);
1610  int index = threadIdx.x + rowpos;
1611  int xypart = (threadIdx.x & 0x08) ? 1 : 0;
1612  int part2 = threadIdx.x & 0xf;
1613  // read data range for this camera
1614  // 8 thread will do the same thing
1615  int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera
1616  int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1
1617 
1618  // loop to read the index of the projection.
1619  // so to get the location to read the jacobian
1620  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1621  int index = tex1Dfetch(tex_jte_cmt, i >> 4);
1622  float temp;
1623  if (JT)
1624  temp = jc[i];
1625  else
1626  temp = jc[(index << 4) + part2];
1627 
1628  float v = tex1Dfetch(tex_jte_pex, (index << 1) + xypart);
1629  sum += temp * v;
1630  }
1631  value[index] = sum;
1632 
1633  if (threadIdx.x < 16) value[index] += value[index + 16];
1634  if (threadIdx.x < 8)
1635  jte[(cam << 3) + threadIdx.x] = value[index] + value[index + 8];
1636 }
1637 
1638 /////////////////////////////////////////////////////////////
1639 
1640 
1641 __global__ void jte_point_kernel(int num, float4* jte) {
1642  ////////////////////////////
1643  int index = blockIdx.x * blockDim.x + threadIdx.x;
1644  if (index >= num) return;
1645 
1646  int idx1 = tex1Dfetch(tex_jte_pmp, index); // first camera
1647  int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last camera + 1
1648  float4 result = make_float4(0, 0, 0, 0);
1649  for (int i = idx1; i < idx2; ++i) {
1650  // error vector
1651  float2 ev = tex1Dfetch(tex_jte_pe, i);
1652 
1653  float4 j1 = tex1Dfetch(tex_jte_jp, i << 1);
1654  result.x += j1.x * ev.x;
1655  result.y += j1.y * ev.x;
1656  result.z += j1.z * ev.x;
1657 
1658  float4 j2 = tex1Dfetch(tex_jte_jp, 1 + (i << 1));
1659  result.x += j2.x * ev.y;
1660  result.y += j2.y * ev.y;
1661  result.z += j2.z * ev.y;
1662  }
1663  jte[index] = result;
1664 }
1665 
1666 ////////////////////
1667 // faster but not always more accurate
1668 //#define JTE_POINT_VEC2
1669 
1670 template <int KH, int TEXN>
1671 __global__ void jte_point_vec_kernel(int num, int rowsz, float* jte) {
1672  ////////////////////////////
1673  __shared__ float value[KH * 128];
1674  int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
1675  if (index >= num) return;
1676 #ifdef JTE_POINT_VEC2
1677  int idx1 = tex1Dfetch(tex_jte_pmp, index); // first
1678  int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1
1679 #else
1680  int idx1 = tex1Dfetch(tex_jte_pmp, index) << 1; // first
1681  int idx2 = tex1Dfetch(tex_jte_pmp, index + 1) << 1; // last + 1
1682 #endif
1683  float rx = 0, ry = 0, rz = 0;
1684  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1685  if (TEXN == 2 && i >> 25) {
1686 #ifdef JTE_POINT_VEC2
1687 
1688  float2 vv = tex1Dfetch(tex_jte_pe, i);
1689  float4 jp1 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1));
1690  float4 jp2 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1) + 1);
1691  rx += (jp1.x * vv.x + jp2.x * vv.y);
1692  ry += (jp1.y * vv.x + jp2.y * vv.y);
1693  rz += (jp1.z * vv.x + jp2.z * vv.y);
1694 #else
1695  float vv = tex1Dfetch(tex_jte_pex, i);
1696  float4 jpi = tex1Dfetch(tex_jte_jp2, i & 0x1ffffff);
1697  rx += jpi.x * vv;
1698  ry += jpi.y * vv;
1699  rz += jpi.z * vv;
1700 #endif
1701  } else {
1702 #ifdef JTE_POINT_VEC2
1703  float2 vv = tex1Dfetch(tex_jte_pe, i);
1704  float4 jp1 = tex1Dfetch(tex_jte_jp, (i << 1));
1705  float4 jp2 = tex1Dfetch(tex_jte_jp, (i << 1) + 1);
1706  rx += (jp1.x * vv.x + jp2.x * vv.y);
1707  ry += (jp1.y * vv.x + jp2.y * vv.y);
1708  rz += (jp1.z * vv.x + jp2.z * vv.y);
1709 #else
1710  float vv = tex1Dfetch(tex_jte_pex, i);
1711  float4 jpi = tex1Dfetch(tex_jte_jp, i);
1712  rx += jpi.x * vv;
1713  ry += jpi.y * vv;
1714  rz += jpi.z * vv;
1715 #endif
1716  }
1717  }
1718 
1719  int rowp = threadIdx.y << 7;
1720  int loc = (threadIdx.x << 2) + rowp;
1721  value[loc] = rx;
1722  value[loc + 1] = ry;
1723  value[loc + 2] = rz;
1724  value[loc + 3] = 0;
1725 
1726  int ridx = threadIdx.x + rowp;
1727  value[ridx] = ((value[ridx] + value[ridx + 32]) +
1728  (value[ridx + 64] + value[ridx + 96]));
1729  if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
1730  if (threadIdx.x < 8) value[ridx] += value[ridx + 8];
1731  if (threadIdx.x < 4)
1732  jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4];
1733 }
1734 
1735 #define JTE_CAMERA_VEC
1736 #define JTE_POINT_VEC
1737 
1738 void ProgramCU::ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap,
1739  CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap,
1740  CuTexImage& jte, bool jc_transpose, int mode) {
1741  //////////////////////////////////////////////////////////
1742  int ncam = int(cmap.GetImgWidth() - 1); // how many cameras
1743  size_t szjc = jc.GetDataSize();
1744 
1745  //////////////////////////////
1746  PBA_BIND_TEX1D(tex_jte_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
1747  PBA_BIND_TEX1D(tex_jte_cmt, cmlist, cudaReadModeElementType, PBA_ChanInt());
1748 #ifdef JTE_CAMERA_VEC2
1749  PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1750  const unsigned int bheight = 2;
1751  dim3 block1(32, bheight), grid1((ncam + bheight - 1) / bheight);
1752  if (mode == 2) {
1753  } else if (jc_transpose)
1754  jte_cam_vec32_kernel<bheight, true><<<grid1, block1>>>(ncam, jc.data(),
1755  jte.data());
1756  else
1757  jte_cam_vec32_kernel<bheight, false><<<grid1, block1>>>(ncam, jc.data(),
1758  jte.data());
1759 
1760 #elif defined(JTE_CAMERA_VEC)
1761  PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1762  const unsigned int bheight = 2;
1763  unsigned int len1 = ncam * 32;
1764  unsigned int bsize1 = 32 * bheight;
1765  unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
1766  dim3 grid1(nblock1);
1767  dim3 block1(32, bheight);
1768  if (mode == 2) {
1769  // skip camera
1770  } else if (szjc > 2 * MAX_TEXSIZE || !jc_transpose) {
1771  if (jc_transpose)
1772  jte_cam_vec32_kernel<bheight, true><<<grid1, block1>>>(ncam, jc.data(),
1773  jte.data());
1774  else
1775  jte_cam_vec32_kernel<bheight, false><<<grid1, block1>>>(ncam, jc.data(),
1776  jte.data());
1777  } else if (szjc > MAX_TEXSIZE) {
1778  PBA_BIND_TEX1D_2(tex_jte_jc, tex_jte_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
1779  jte_cam_vec_kernel<bheight, 2><<<grid1, block1>>>(ncam, jte.data());
1780  } else {
1781  PBA_BIND_TEX1D(tex_jte_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
1782  jte_cam_vec_kernel<bheight, 1><<<grid1, block1>>>(ncam, jte.data());
1783  }
1784 #else
1785  PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1786  unsigned int len1 = ncam * 16;
1787  unsigned int bsize1 = len1 > 32 * 128 ? 128 : (len1 > 32 * 64 ? 64 : 32);
1788  unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
1789  dim3 grid1(nblock1), block1(bsize1);
1790  jte_cam_kernel<<<grid1, block1>>>(len1, jc.data(), jte.data());
1791 #endif
1792  CheckErrorCUDA("ComputeJtE<Camera>");
1793 
1794  ////////////////////////////////////////////
1795  PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
1796  unsigned int npoint = (pmap.GetImgWidth() - 1);
1797 #ifndef JTE_POINT_VEC
1798  size_t len2 = npoint;
1799  unsigned int bsize2 = 64;
1800  unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
1801  dim3 grid2(nblock2), block2(bsize2);
1802  PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
1803  PBA_BIND_TEX1D(tex_jte_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1804  jte_point_kernel<<<grid2, block2>>>(len2, ((float4*)jte.data()) + 2 * ncam);
1805 #else
1806 
1807 #ifdef JTE_POINT_VEC2
1808  PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
1809 #else
1810  PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1811 #endif
1812  const unsigned int bheight2 = 2;
1813  unsigned int bsize2 = 32;
1814  unsigned int nblock2 = (unsigned int)((npoint + bheight2 - 1) / bheight2);
1815  unsigned int offsetv = 8 * ncam;
1816  unsigned int bw, bh;
1817  GetBlockConfiguration(nblock2, bw, bh);
1818  dim3 grid2(bw, bh), block2(bsize2, bheight2);
1819  if (mode == 1) {
1820  // skip point
1821  } else if (jp.GetDataSize() > MAX_TEXSIZE) {
1822  PBA_BIND_TEX1D_2(tex_jte_jp, tex_jte_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
1823  jte_point_vec_kernel<bheight2, 2><<<grid2, block2>>>(
1824  npoint, bw * bheight2, ((float*)jte.data()) + offsetv);
1825  } else {
1826  PBA_BIND_TEX1D(tex_jte_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1827  jte_point_vec_kernel<bheight2, 1><<<grid2, block2>>>(
1828  npoint, bw * bheight2, ((float*)jte.data()) + offsetv);
1829  }
1830 #endif
1831  CheckErrorCUDA("ComputeJtE<Point>");
1832 }
1833 
1834 
1835 
1836 template <int VN, int KH, bool JT>
1837 __global__ void jtjd_cam_vec32_kernel(int num, int add_existing_dq, float* jc,
1838  float* jtjd, float* jtjdi) {
1839  __shared__ float value[KH * 32];
1840 
1841  // 8thread per camera
1842  int cam = blockIdx.x * KH + threadIdx.y;
1843  int part = threadIdx.x & 0x7; // which parameter of this camera
1844  int part2 = threadIdx.x & 0xf;
1845  int campos = threadIdx.y << 5;
1846  int index = threadIdx.x + campos;
1847  float sum = 0;
1848  if (cam < num && part < VN) {
1849  // read data range for this camera
1850  // 8 thread will do the same thing
1851  int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera
1852  int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1
1853 
1854  // loop to read the index of the projection.
1855  // so to get the location to read the jacobian
1856  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1857  if (JT) {
1858  float temp = jc[i];
1859  sum += temp * temp;
1860  } else {
1861  int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4;
1862  float temp = jc[ii + part2];
1863  sum += temp * temp;
1864  }
1865  }
1866  }
1867  __syncthreads();
1868 
1869  if (cam >= num) return;
1870  // save all the results?
1871  value[index] = sum;
1872  if (threadIdx.x < 16) value[index] += value[index + 16];
1873  if (threadIdx.x < 8)
1874 
1875  // write back
1876  if (threadIdx.x < 8) {
1877  float temp = value[index] + value[index + 8];
1878  int wpos = threadIdx.x + (cam << 3);
1879  if (add_existing_dq) temp += jtjd[wpos];
1880  jtjd[wpos] = temp;
1881  jtjdi[wpos] = temp == 0 ? 0 : 1 / (temp);
1882  }
1883 }
1884 
1885 
1886 
1887 #define JTJD_POINT_KWIDTH 64
1888 
1889 template <int TEXN>
1890 __global__ void jtjd_point_kernel(int num, int rowsz, float4* jtjd,
1891  float4* jtjdi) {
1892  ////////////////////////////
1893  int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
1894  if (index >= num) return;
1895 
1896  int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera
1897  int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1
1898  float rx = 0, ry = 0, rz = 0;
1899  for (int i = idx1; i < idx2; ++i) {
1900  if (TEXN == 2 && i > 0xffffff) {
1901  float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1);
1902  rx += j1.x * j1.x;
1903  ry += j1.y * j1.y;
1904  rz += j1.z * j1.z;
1905 
1906  float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1));
1907  rx += j2.x * j2.x;
1908  ry += j2.y * j2.y;
1909  rz += j2.z * j2.z;
1910  } else {
1911  float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1);
1912  rx += j1.x * j1.x;
1913  ry += j1.y * j1.y;
1914  rz += j1.z * j1.z;
1915 
1916  float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1));
1917  rx += j2.x * j2.x;
1918  ry += j2.y * j2.y;
1919  rz += j2.z * j2.z;
1920  }
1921  }
1922 
1923  if (jtjd) jtjd[index] = make_float4(rx, ry, rz, 0.0f);
1924  jtjdi[index] = make_float4(1.0f / rx, 1.0f / ry, 1.0f / rz, 0.0f);
1925 }
1926 
1927 void ProgramCU::ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap,
1928  CuTexImage& jp, CuTexImage& pmap,
1929  CuTexImage& cmlist, CuTexImage& jtjd,
1930  CuTexImage& jtjdi, bool jc_transpose,
1931  int radial, bool add_existing_diagc) {
1932  //////////////////////////////////////////////////////////
1933  size_t szjc = jc.GetDataSize();
1934  unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras
1935 
1936  const unsigned int bheight = 2;
1937  dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight);
1938  PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
1939  if (jc_transpose) {
1940  if (radial)
1941  jtjd_cam_vec32_kernel<8, bheight, true><<<grid1x, block1x>>>(
1942  ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1943  else
1944  jtjd_cam_vec32_kernel<7, bheight, true><<<grid1x, block1x>>>(
1945  ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1946  } else {
1947  PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
1948  if (radial)
1949  jtjd_cam_vec32_kernel<8, bheight, false><<<grid1x, block1x>>>(
1950  ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1951  else
1952  jtjd_cam_vec32_kernel<7, bheight, false><<<grid1x, block1x>>>(
1953  ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1954  }
1955  CheckErrorCUDA("ComputeDiagonal<Camera>");
1956 
1957  ////////////////////////////////////////////
1958  unsigned int npoint = (pmap.GetImgWidth() - 1);
1959  unsigned int len2 = npoint;
1960  unsigned int bsize2 = JTJD_POINT_KWIDTH;
1961  unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
1962  unsigned int bw, bh;
1963  GetBlockConfiguration(nblock2, bw, bh);
1964  dim3 grid2(bw, bh), block2(bsize2);
1965  PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
1966 
1967  if (jp.GetDataSize() > MAX_TEXSIZE) {
1968  PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
1969  jtjd_point_kernel<2><<<grid2, block2>>>(len2, (bw * bsize2),
1970  ((float4*)jtjd.data()) + 2 * ncam,
1971  ((float4*)jtjdi.data()) + 2 * ncam);
1972  } else {
1973  PBA_BIND_TEX1D(tex_jtjd_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1974  jtjd_point_kernel<1><<<grid2, block2>>>(len2, (bw * bsize2),
1975  ((float4*)jtjd.data()) + 2 * ncam,
1976  ((float4*)jtjdi.data()) + 2 * ncam);
1977  }
1978  CheckErrorCUDA("ComputeDiagonal<Point>");
1979 }
1980 
1981 // for each
1982 template <bool SJ>
1983 __global__ void jtjd_cam_q_kernel(int num, int rowsz, float* qw, float4* diag) {
1984  int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y;
1985  int index = bindex + threadIdx.x;
1986  if (index >= num) return;
1987  int tid = index & 0x1;
1988  float w = qw[index], ws = w * w * 2.0f;
1989  if (SJ) {
1990  float4 sj = tex1Dfetch(tex_jacobian_sj, index);
1991  float4 dj = tid == 0 ? make_float4(sj.x * sj.x * ws, 0, 0, 0)
1992  : make_float4(0, 0, 0, sj.w * sj.w * ws);
1993  diag[index] = dj;
1994  } else {
1995  float4 dj = tid == 0 ? make_float4(ws, 0, 0, 0) : make_float4(0, 0, 0, ws);
1996  diag[index] = dj;
1997  }
1998 }
1999 
2000 void ProgramCU::ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj,
2001  CuTexImage& diag) {
2002  unsigned int bsize = 32;
2003  unsigned int len = qlistw.GetImgWidth() * 2;
2004  unsigned int nblock = (len + bsize - 1) / bsize;
2005  unsigned int bw, bh;
2006  GetBlockConfiguration(nblock, bw, bh);
2007  dim3 grid(bw, bh), block(bsize);
2008  if (sj.IsValid()) {
2009  PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
2010  jtjd_cam_q_kernel<true><<<grid, block>>>(len, (bw * bsize), qlistw.data(),
2011  (float4*)diag.data());
2012  } else {
2013  jtjd_cam_q_kernel<false><<<grid, block>>>(len, (bw * bsize), qlistw.data(),
2014  (float4*)diag.data());
2015  }
2016  CheckErrorCUDA("ComputeDiagonalQ");
2017 }
2018 
2019 template <int VN, int KH, bool JT>
2020 __global__ void jtjd_cam_block_vec32_kernel(int num, float lambda1,
2021  float lambda2, float* jc,
2022  float* diag, float* blocks,
2023  bool add_existing_diagc) {
2024  __shared__ float value[KH * 32 * VN];
2025 
2026  // 8thread per camera
2027  int cam = blockIdx.x * KH + threadIdx.y;
2028  int part = threadIdx.x & 0x7; // which parameter of this camera
2029  int part2 = threadIdx.x & 0xf;
2030  int index = threadIdx.x + (threadIdx.y << 5);
2031  float row[8] = {0, 0, 0, 0, 0, 0, 0, 0};
2032  if (cam < num) {
2033  int rowpos = index - part;
2034  // read data range for this camera
2035  // 8 thread will do the same thing
2036  int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera
2037  int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1
2038 
2039  // loop to read the index of the projection.
2040  // so to get the location to read the jacobian
2041  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
2042  if (JT) {
2043  float temp = jc[i];
2044  value[index] = temp;
2045  for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]);
2046  } else {
2047  int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4;
2048  float temp = jc[ii + part2];
2049  value[index] = temp;
2050  for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]);
2051  }
2052  }
2053  }
2054  __syncthreads();
2055 
2056  if (cam >= num) return;
2057  // save all the results?
2058  for (int i = 0; i < VN; ++i) value[index * VN + i] = row[i];
2059  int campos = threadIdx.y * (32 * VN);
2060  for (int i = threadIdx.x; i < (VN * 16); i += 32)
2061  value[campos + i] += value[campos + i + (16 * VN)];
2062  for (int i = threadIdx.x; i < (VN * 8); i += 32)
2063  value[campos + i] += value[campos + i + (8 * VN)];
2064 
2065  if (VN == 7) {
2066  bool zero = (part >= VN);
2067 
2068  // write back
2069  if (threadIdx.x < 8) {
2070  float* dp = value + campos + threadIdx.x * (VN + 1);
2071  float temp = zero ? 0 : dp[0];
2072  int didx = threadIdx.x + (cam << 3);
2073  if (add_existing_diagc) temp += diag[didx];
2074  diag[didx] = temp;
2075  dp[0] = lambda1 + lambda2 * temp;
2076  }
2077  int wpos = cam * (8 * VN) + threadIdx.x;
2078  int rpos = campos + threadIdx.x - (threadIdx.x >> 3);
2079  blocks[wpos] = zero ? 0 : value[rpos];
2080  if (threadIdx.x < (VN * 8 - 32))
2081  blocks[wpos + 32] = zero ? 0 : value[rpos + 28];
2082  } else {
2083  // write back
2084  if (threadIdx.x < 8) {
2085  float* dp = value + campos + threadIdx.x * (VN + 1);
2086  float temp = dp[0];
2087  int didx = threadIdx.x + (cam << 3);
2088  if (add_existing_diagc) temp += diag[didx];
2089  diag[didx] = temp;
2090  dp[0] = lambda1 + lambda2 * temp; // max(, 1e-6) ;
2091  }
2092  int wpos = cam * (8 * VN) + threadIdx.x;
2093  int rpos = campos + threadIdx.x;
2094  blocks[wpos] = value[rpos];
2095  blocks[wpos + 32] = value[rpos + 32];
2096  }
2097 }
2098 
2099 #define JTJD_POINT_BLOCK_KWIDTH 64
2100 
2101 template <int TEXN>
2102 __global__ void jtjd_point_block_kernel(int num, int rowsz, float lambda1,
2103  float lambda2, float4* diag,
2104  float4* blocks) {
2105  ////////////////////////////
2106  int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
2107  if (index >= num) return;
2108 
2109  int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera
2110  int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1
2111 
2112  float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
2113  for (int i = idx1; i < idx2; ++i) {
2114  if (TEXN == 2 && i > 0xffffff) {
2115  float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1);
2116  M00 += j1.x * j1.x;
2117  M01 += j1.x * j1.y;
2118  M02 += j1.x * j1.z;
2119  M11 += j1.y * j1.y;
2120  M12 += j1.y * j1.z;
2121  M22 += j1.z * j1.z;
2122 
2123  float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1));
2124  M00 += j2.x * j2.x;
2125  M01 += j2.x * j2.y;
2126  M02 += j2.x * j2.z;
2127  M11 += j2.y * j2.y;
2128  M12 += j2.y * j2.z;
2129  M22 += j2.z * j2.z;
2130  } else {
2131  float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1);
2132  M00 += j1.x * j1.x;
2133  M01 += j1.x * j1.y;
2134  M02 += j1.x * j1.z;
2135  M11 += j1.y * j1.y;
2136  M12 += j1.y * j1.z;
2137  M22 += j1.z * j1.z;
2138 
2139  float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1));
2140  M00 += j2.x * j2.x;
2141  M01 += j2.x * j2.y;
2142  M02 += j2.x * j2.z;
2143  M11 += j2.y * j2.y;
2144  M12 += j2.y * j2.z;
2145  M22 += j2.z * j2.z;
2146  }
2147  }
2148 
2149  diag[index] = make_float4(M00, M11, M22, 0);
2150 
2151  M00 = lambda2 * M00 + lambda1;
2152  M11 = lambda2 * M11 + lambda1;
2153  M22 = lambda2 * M22 + lambda1;
2154 
2155  // invert the 3x3 matrix.
2156  float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 -
2157  M02 * M02 * M11 - M12 * M12 * M00;
2158  if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
2159  int write_pos = index * 3;
2160  blocks[write_pos] = make_float4(0, 0, 0, 0);
2161  blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
2162  blocks[write_pos + 2] = make_float4(0, 0, 0, 0);
2163  } else {
2164  float m00 = (M11 * M22 - M12 * M12) / det;
2165  float m01 = -(M01 * M22 - M12 * M02) / det;
2166  float m02 = (M01 * M12 - M02 * M11) / det;
2167  int write_pos = index * 3;
2168  blocks[write_pos] = make_float4(m00, m01, m02, 0);
2169 
2170  float m11 = (M00 * M22 - M02 * M02) / det;
2171  float m12 = -(M00 * M12 - M01 * M02) / det;
2172  blocks[write_pos + 1] = make_float4(m01, m11, m12, 0);
2173 
2174  float m22 = (M00 * M11 - M01 * M01) / det;
2175  blocks[write_pos + 2] = make_float4(m02, m12, m22, 0);
2176  }
2177 }
2178 
2179 #define JTJD_BLOCK_CAM_INVERT_KWIDTH 64
2180 template <int VN>
2181 __global__ void jtjd_cam_block_invert_kernel(int num, float4* blocks) {
2182  // N / 8 cameras...each have 64 floats,,,, N * 8 float
2183  // each will read 8 float......
2184  __shared__ float value[JTJD_BLOCK_CAM_INVERT_KWIDTH * VN];
2185  __shared__ bool invalid[JTJD_BLOCK_CAM_INVERT_KWIDTH / 8];
2186  //////////////////////////////////////////////
2187 
2188  int bindex = IMUL(blockIdx.x, blockDim.x);
2189  int index = bindex + threadIdx.x;
2190  int block_read_pos = IMUL(bindex, VN);
2191  for (int i = 0; i < JTJD_BLOCK_CAM_INVERT_KWIDTH * VN;
2192  i += JTJD_BLOCK_CAM_INVERT_KWIDTH)
2193  value[threadIdx.x + i] = ((float*)blocks)[block_read_pos + threadIdx.x + i];
2194  __syncthreads();
2195  const int cam_id = threadIdx.x >> 3;
2196  const int cam_pos = IMUL(cam_id, VN * 8);
2197  const int col = threadIdx.x & 0x7, rowj_pos = col << 3;
2198  ; //
2199 
2200  float* a = value + cam_pos;
2201  for (int i = 0; i < VN; ++i) {
2202  int rowi_pos = i << 3, dpos = i + rowi_pos;
2203  if (col == i && a[dpos] > 0) a[dpos] = rsqrt(a[dpos]);
2204  __syncthreads();
2205  float diag = a[dpos];
2206  if (diag == 0 || col >= VN) continue;
2207  if (col < i) {
2208  a[rowi_pos + col] = 0;
2209  } else if (col > i) {
2210  float aij = a[rowi_pos + col] * diag;
2211  a[rowi_pos + col] = aij;
2212  for (int k = col; k < VN; ++k) a[rowj_pos + k] -= a[rowi_pos + k] * aij;
2213  }
2214  }
2215 
2216  if (index >= num) return;
2217 
2218  if (col == 0) invalid[cam_id] = false;
2219  if (col < VN) {
2220  for (int i = 1; i < VN; ++i) {
2221  int rowi_pos = i << 3, dpos = i + rowi_pos;
2222  if (a[dpos] == 0) continue;
2223  if (col < i) {
2224  float sum = 0;
2225  for (int k = col; k < i; ++k)
2226  sum += (a[(k << 3) + i] * a[rowj_pos + k]);
2227  a[rowj_pos + i] = -sum * a[dpos];
2228  }
2229  }
2230  float ai[8], amax = 0;
2231  for (int i = 0; i < VN * 8; i += 8) {
2232  float sum = 0;
2233  for (int k = 0; k < VN; k++) sum += a[rowj_pos + k] * a[i + k];
2234  ai[i >> 3] = sum;
2235  amax = max(amax, sum);
2236  }
2237 
2238  if (isinf(amax)) invalid[cam_id] = true;
2239  int write_pos = IMUL((index >> 3), (VN * 2)) + (col << 1);
2240  if (invalid[cam_id]) // a better way would be using a threshold
2241  {
2242  blocks[write_pos] = make_float4(0, 0, 0, 0);
2243  blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
2244  } else {
2245  blocks[write_pos] = make_float4(ai[0], ai[1], ai[2], ai[3]);
2246  blocks[write_pos + 1] =
2247  make_float4(ai[4], ai[5], ai[6], VN < 8 ? 0 : ai[7]);
2248  }
2249  }
2250 }
2251 
2252 void ProgramCU::ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc,
2253  CuTexImage& cmap, CuTexImage& jp,
2254  CuTexImage& pmap, CuTexImage& cmlist,
2255  CuTexImage& diag, CuTexImage& blocks,
2256  int radial_distortion, bool jc_transpose,
2257  bool add_existing_diagc, int mode) {
2258  size_t szjc = jc.GetDataSize();
2259  unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras
2260  float lambda1 = dampd ? 0.0f : lambda;
2261  float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
2262  const unsigned int bheight = 2;
2263  dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight);
2264  PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
2265 
2266  if (mode == 2) {
2267  // point only mode?
2268  } else if (radial_distortion) {
2269  if (jc_transpose) {
2270  jtjd_cam_block_vec32_kernel<8, bheight, true><<<grid1x, block1x>>>(
2271  ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2272  add_existing_diagc);
2273  } else {
2274  PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
2275  jtjd_cam_block_vec32_kernel<8, bheight, false><<<grid1x, block1x>>>(
2276  ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2277  add_existing_diagc);
2278  }
2279  } else {
2280  if (jc_transpose) {
2281  jtjd_cam_block_vec32_kernel<7, bheight, true><<<grid1x, block1x>>>(
2282  ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2283  add_existing_diagc);
2284  } else {
2285  PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
2286  jtjd_cam_block_vec32_kernel<7, bheight, false><<<grid1x, block1x>>>(
2287  ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2288  add_existing_diagc);
2289  }
2290  }
2291  CheckErrorCUDA("ComputeDiagonalBlock<Camera>");
2292 
2293  ////////////////////////////////////////////
2294  unsigned int npoint = (pmap.GetImgWidth() - 1);
2295  unsigned int len2 = npoint;
2296  unsigned int bsize2 = JTJD_POINT_BLOCK_KWIDTH;
2297  unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
2298  unsigned int bw, bh;
2299  unsigned int offsetd = 2 * ncam;
2300  unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam;
2301  GetBlockConfiguration(nblock2, bw, bh);
2302  dim3 grid2(bw, bh), block2(bsize2);
2303  PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
2304  if (mode == 1) {
2305  // camera only mode?
2306  } else if (jp.GetDataSize() > MAX_TEXSIZE) {
2307  PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
2308  jtjd_point_block_kernel<2><<<grid2, block2>>>(
2309  len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd,
2310  ((float4*)blocks.data()) + offsetb);
2311  } else {
2312  PBA_BIND_TEX1D(tex_jtjd_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2313  jtjd_point_block_kernel<1><<<grid2, block2>>>(
2314  len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd,
2315  ((float4*)blocks.data()) + offsetb);
2316  }
2317  CheckErrorCUDA("ComputeDiagonalBlock<Point>");
2318 
2319  if (mode != 2) {
2320  unsigned int len3 = ncam * 8;
2321  unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH;
2322  unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3;
2323  dim3 grid3(nblock3), block3(bsize3);
2324  if (radial_distortion)
2325  jtjd_cam_block_invert_kernel<8><<<grid3, block3>>>(
2326  len3, (float4*)blocks.data());
2327  else
2328  jtjd_cam_block_invert_kernel<7><<<grid3, block3>>>(
2329  len3, (float4*)blocks.data());
2330  CheckErrorCUDA("ComputeDiagonalBlockInverse<Camera>");
2331  }
2332 }
2333 
2334 template <int WIDTH, int BBIT, int VSZ>
2335 __global__ void multiply_block_conditioner_kernel(int num, int rowsz,
2336  float* blocks, float* x,
2337  float* result) {
2338  __shared__ float mat[WIDTH * VSZ];
2339  __shared__ float val[WIDTH];
2340  const int BSZ = 1 << BBIT;
2341  const int BMASK = BSZ - 1;
2342  int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y;
2343  int index = bindex + threadIdx.x;
2344  int block_read_pos = bindex * VSZ;
2345  val[threadIdx.x] = x[index];
2346  for (int i = 0; i < VSZ * WIDTH; i += WIDTH)
2347  mat[i + threadIdx.x] = blocks[i + block_read_pos + threadIdx.x];
2348  __syncthreads();
2349  if (index >= num) return;
2350  float* ac = mat + (threadIdx.x >> BBIT) * (BSZ * VSZ) + (threadIdx.x & BMASK);
2351  float* xc = val + (threadIdx.x & (~BMASK));
2352  float sum = 0;
2353  for (int i = 0; i < VSZ; ++i) sum += ac[i << BBIT] * xc[i];
2354  result[index] = sum; // isinf(sum) ? 0 : sum ; //
2355 }
2356 
2357 void ProgramCU::MultiplyBlockConditioner(int ncam, int npoint,
2358  CuTexImage& blocks, CuTexImage& vector,
2359  CuTexImage& result, int radial,
2360  int mode) {
2361  const unsigned int bsize1 = 64;
2362  unsigned int bw, bh;
2363 
2364  if (mode != 2) {
2365  unsigned int len1 = ncam * 8;
2366  unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
2367  GetBlockConfiguration(nblock1, bw, bh);
2368  dim3 grid1(bw, bh), block1(bsize1);
2369  if (radial)
2370  multiply_block_conditioner_kernel<bsize1, 3, 8><<<grid1, block1>>>(
2371  len1, (bw * bsize1), blocks.data(), vector.data(), result.data());
2372  else
2373  multiply_block_conditioner_kernel<bsize1, 3, 7><<<grid1, block1>>>(
2374  len1, (bw * bsize1), blocks.data(), vector.data(), result.data());
2375  CheckErrorCUDA("MultiplyBlockConditioner<Camera>");
2376  }
2377 
2378  if (mode != 1) {
2379  const unsigned int bsize2 = 128;
2380  unsigned int len2 = npoint * 4;
2381  unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
2382  unsigned int cbsz = radial ? 64 : 56;
2383  unsigned int offsetb = ncam * cbsz;
2384  unsigned int offsetd = ncam * 8;
2385  GetBlockConfiguration(nblock2, bw, bh);
2386  dim3 grid2(bw, bh), block2(bsize2);
2387  multiply_block_conditioner_kernel<bsize2, 2, 3><<<grid2, block2>>>(
2388  len2, (bw * bsize2), blocks.data() + offsetb, vector.data() + offsetd,
2389  result.data() + offsetd);
2390  CheckErrorCUDA("MultiplyBlockConditioner<Point>");
2391  }
2392 }
2393 
2394 
2395 template <int TEXN>
2396 __global__ void shuffle_camera_jacobian_kernel(int num, int bwidth,
2397  float4* jc) {
2398  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2399  if (index >= num) return;
2400  int fetch_idx = tex1Dfetch(tex_shuffle_map, index >> 2);
2401  if (TEXN == 2) {
2402  int texidx = fetch_idx >> 23,
2403  fidx = ((fetch_idx & 0x7fffff) << 2) + (index & 0x3);
2404  if (texidx == 0)
2405  jc[index] = tex1Dfetch(tex_shuffle_jc, fidx);
2406  else if (texidx == 1)
2407  jc[index] = tex1Dfetch(tex_shuffle_jc2, fidx);
2408  }
2409  if (TEXN == 1) {
2410  jc[index] = tex1Dfetch(tex_shuffle_jc, (fetch_idx << 2) + (index & 0x3));
2411  }
2412 }
2413 
2414 bool ProgramCU::ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map,
2415  CuTexImage& result) {
2416  if (!result.IsValid()) return false;
2417  size_t szjc = jc.GetDataSize();
2418  unsigned int len = map.GetImgWidth() * 4;
2419  unsigned int bsize = 128;
2420  unsigned int nblock = (len + bsize - 1) / bsize;
2421 
2422  PBA_BIND_TEX1D(tex_shuffle_map, map, cudaReadModeElementType, PBA_ChanInt());
2423 
2424  if (szjc > 2 * MAX_TEXSIZE) {
2425  fprintf(stderr, "datasize way too big %lX, %lX+...\n", szjc,
2426  (szjc) / MAX_TEXSIZE);
2427  return false;
2428  } else if (szjc > MAX_TEXSIZE) {
2429  unsigned int bw, bh;
2430  GetBlockConfiguration(nblock, bw, bh);
2431  dim3 grid(bw, bh), block(bsize);
2432  PBA_BIND_TEX1D_2(tex_shuffle_jc, tex_shuffle_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2433  shuffle_camera_jacobian_kernel<2><<<grid, block>>>(len, (bw * bsize),
2434  (float4*)result.data());
2435  } else {
2436  PBA_BIND_TEX1D(tex_shuffle_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2437  unsigned int bw, bh;
2438  GetBlockConfiguration(nblock, bw, bh);
2439  dim3 grid(bw, bh), block(bsize);
2440  shuffle_camera_jacobian_kernel<1><<<grid, block>>>(len, (bw * bsize),
2441  (float4*)result.data());
2442  }
2443  CheckErrorCUDA("ShuffleCameraJacobian");
2444  return true;
2445 }
2446 
2447 
2448 
2449 template <int TEXN>
2450 __global__ void multiply_jx_kernel(int num, int bwidth, int offset,
2451  float* result) {
2452  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2453  if (index >= num) return;
2454 
2455  if (TEXN == 4 && (index >> 24) == 3) {
2456  ////////////////////////////////////////////
2457  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2458  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2459  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2460  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2461 
2462  ////////////////////////////////////////////
2463  float4 jp, jc1, jc2;
2464  jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2465  jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1);
2466  jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1);
2467 
2468  /////////////////////////////////////
2469  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2470  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2471  jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2472  jp.z * xp.z;
2473  } else if (TEXN > 2 && (index >> 24) == 2) {
2474  ////////////////////////////////////////////
2475  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2476  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2477  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2478  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2479 
2480  ////////////////////////////////////////////
2481  float4 jp, jc1, jc2;
2482  jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2483  jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1);
2484  jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1);
2485 
2486  /////////////////////////////////////
2487  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2488  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2489  jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2490  jp.z * xp.z;
2491  } else if (TEXN > 1 && (index > 0xffffff)) {
2492  ////////////////////////////////////////////
2493  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2494  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2495  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2496  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2497 
2498  ////////////////////////////////////////////
2499  float4 jp, jc1, jc2;
2500  jp = tex1Dfetch(tex_mjx_jp, index & 0x1ffffff);
2501  jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1);
2502  jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1);
2503 
2504  /////////////////////////////////////
2505  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2506  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2507  jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2508  jp.z * xp.z;
2509  } else {
2510  ////////////////////////////////////////////
2511  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2512  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2513  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2514  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2515 
2516  ////////////////////////////////////////////
2517  float4 jp, jc1, jc2;
2518  jp = tex1Dfetch(tex_mjx_jp, index);
2519  jc1 = tex1Dfetch(tex_mjx_jc, index << 1);
2520  jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1);
2521 
2522  /////////////////////////////////////
2523  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2524  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2525  jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2526  jp.z * xp.z;
2527  }
2528 }
2529 
2530 template <int TEXN>
2531 __global__ void multiply_jcx_kernel(int num, int bwidth, float* result) {
2532  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2533  if (index >= num) return;
2534 
2535  if (TEXN == 4 && (index >> 24) == 3) {
2536  ////////////////////////////////////////////
2537  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2538  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2539  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2540 
2541  ////////////////////////////////////////////
2542  float4 jc1, jc2;
2543  jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1);
2544  jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1);
2545 
2546  /////////////////////////////////////
2547  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2548  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2549  jc2.z * xc2.z + jc2.w * xc2.w;
2550  } else if (TEXN > 2 && (index >> 24) == 2) {
2551  ////////////////////////////////////////////
2552  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2553  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2554  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2555 
2556  ////////////////////////////////////////////
2557  float4 jc1, jc2;
2558  jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1);
2559  jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1);
2560 
2561  /////////////////////////////////////
2562  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2563  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2564  jc2.z * xc2.z + jc2.w * xc2.w;
2565  } else if (TEXN > 1 && (index > 0xffffff)) {
2566  ////////////////////////////////////////////
2567  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2568  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2569  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2570 
2571  ////////////////////////////////////////////
2572  float4 jc1, jc2;
2573  jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1);
2574  jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1);
2575 
2576  /////////////////////////////////////
2577  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2578  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2579  jc2.z * xc2.z + jc2.w * xc2.w;
2580  } else {
2581  ////////////////////////////////////////////
2582  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2583  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2584  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2585 
2586  ////////////////////////////////////////////
2587  float4 jc1, jc2;
2588  jc1 = tex1Dfetch(tex_mjx_jc, index << 1);
2589  jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1);
2590 
2591  /////////////////////////////////////
2592  result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2593  jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2594  jc2.z * xc2.z + jc2.w * xc2.w;
2595  }
2596 }
2597 
2598 template <int TEXN>
2599 __global__ void multiply_jpx_kernel(int num, int bwidth, int offset,
2600  float* result) {
2601  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2602  if (index >= num) return;
2603 
2604  if (TEXN == 2 && index > 0x1ffffff) {
2605  ////////////////////////////////////////////
2606  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2607  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2608  ////////////////////////////////////////////
2609  float4 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2610  /////////////////////////////////////
2611  result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z;
2612  } else {
2613  ////////////////////////////////////////////
2614  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2615  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2616 
2617  ////////////////////////////////////////////
2618  float4 jp = tex1Dfetch(tex_mjx_jp, index);
2619  /////////////////////////////////////
2620  result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z;
2621  }
2622 }
2623 
2624 template <int KW>
2625 __global__ void multiply_jx_notex2_kernel(int num, int bwidth, int offset,
2626  float* jcx, float* jpx,
2627  float* result) {
2628  int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2629  int index = threadIdx.x + bindex;
2630 
2631  ////////////////////////////////////////////
2632  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2633  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2634  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2635  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2636  ////////////////////////////////////////////
2637  __shared__ float jps[KW * 4];
2638  __shared__ float jcs[KW * 8];
2639 
2640  for (int i = threadIdx.x; i < 4 * KW; i += KW)
2641  jps[i] = jpx[(bindex << 2) + i];
2642  for (int i = threadIdx.x; i < 8 * KW; i += KW)
2643  jcs[i] = jcx[(bindex << 3) + i];
2644 
2645  __syncthreads();
2646  if (index >= num) return;
2647 
2648  /////////////////////////////////////
2649  float *jp = jps + threadIdx.x * 4, *jc = jcs + threadIdx.x * 8;
2650  result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z +
2651  jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y +
2652  jc[6] * xc2.z + jc[7] * xc2.w + jp[0] * xp.x + jp[1] * xp.y +
2653  jp[2] * xp.z;
2654 }
2655 
2656 template <int KW>
2657 __global__ void multiply_jpx_notex2_kernel(int num, int bwidth, int offset,
2658  float* jpx, float* result) {
2659  int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2660  int index = threadIdx.x + bindex;
2661 
2662  ////////////////////////////////////////////
2663  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2664  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2665  ////////////////////////////////////////////
2666  __shared__ float jps[KW * 4];
2667 
2668  for (int i = threadIdx.x; i < 4 * KW; i += KW)
2669  jps[i] = jpx[(bindex << 2) + i];
2670 
2671  __syncthreads();
2672  if (index >= num) return;
2673 
2674  /////////////////////////////////////
2675  float* jp = jps + threadIdx.x * 4;
2676  result[index] = jp[0] * xp.x + jp[1] * xp.y + jp[2] * xp.z;
2677 }
2678 
2679 template <int KW>
2680 __global__ void multiply_jcx_notex2_kernel(int num, int bwidth, float* jcx,
2681  float* result) {
2682  int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2683  int index = threadIdx.x + bindex;
2684 
2685  ////////////////////////////////////////////
2686  int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2687  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2688  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2689  ////////////////////////////////////////////
2690 
2691  __shared__ float jcs[KW * 8];
2692  for (int i = threadIdx.x; i < 8 * KW; i += KW)
2693  jcs[i] = jcx[(bindex << 3) + i];
2694 
2695  __syncthreads();
2696  if (index >= num) return;
2697 
2698  /////////////////////////////////////
2699  float* jc = jcs + threadIdx.x * 8;
2700  result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z +
2701  jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y +
2702  jc[6] * xc2.z + jc[7] * xc2.w;
2703 }
2704 
2705 void ProgramCU::ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc,
2706  CuTexImage& jp, CuTexImage& jmap, CuTexImage& result,
2707  int mode) {
2708  // given a vector of parameters....
2709  // multiply the Jacobian Matrix with it [jc jp] * p
2710  // for each measurment, read back the jacobian
2711  // multiply and summ up th corresponding
2712 
2713  unsigned int nproj = jmap.GetImgWidth();
2714  unsigned int len = nproj * 2;
2715  unsigned int bsize = 64;
2716  unsigned int nblock = (len + bsize - 1) / bsize;
2717  unsigned int bw, bh;
2718  PBA_BIND_TEX1D(tex_mjx_idx, jmap, cudaReadModeElementType, PBA_ChanInt2());
2719  PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
2720 
2721  if (mode == 0) {
2722  size_t szjc = jc.GetDataSize();
2723  if (TEX_TOOBIG4(szjc)) {
2724  GetBlockConfiguration(nblock, bw, bh);
2725  dim3 grid(bw, bh), block(bsize);
2726  multiply_jx_notex2_kernel<64><<<grid, block>>>(
2727  len, (bw * bsize), point_offset, jc.data(), jp.data(), result.data());
2728  } else if (szjc > 2 * MAX_TEXSIZE) {
2729  PBA_BIND_TEX1D_2(tex_mjx_jp, tex_mjx_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
2730  PBA_BIND_TEX1D_4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4, jc, cudaReadModeElementType, PBA_ChanFloat4());
2731  GetBlockConfiguration(nblock, bw, bh);
2732  dim3 grid(bw, bh), block(bsize);
2733  multiply_jx_kernel<4><<<grid, block>>>(len, (bw * bsize), point_offset,
2734  result.data());
2735  } else if (szjc > MAX_TEXSIZE) {
2736  PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2737  PBA_BIND_TEX1D_2(tex_mjx_jc, tex_mjx_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2738  GetBlockConfiguration(nblock, bw, bh);
2739  dim3 grid(bw, bh), block(bsize);
2740  multiply_jx_kernel<2><<<grid, block>>>(len, (bw * bsize), point_offset,
2741  result.data());
2742  } else {
2743  PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2744  PBA_BIND_TEX1D(tex_mjx_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2745  GetBlockConfiguration(nblock, bw, bh);
2746  dim3 grid(bh, bw), block(bsize);
2747  multiply_jx_kernel<1><<<grid, block>>>(len, (bh * bsize), point_offset,
2748  result.data());
2749  }
2750  CheckErrorCUDA("ComputeJX");
2751  } else if (mode == 1) {
2752  size_t szjc = jc.GetDataSize();
2753  if (TEX_TOOBIG4(szjc)) {
2754  GetBlockConfiguration(nblock, bw, bh);
2755  dim3 grid(bw, bh), block(bsize);
2756  multiply_jcx_notex2_kernel<64><<<grid, block>>>(len, (bw * bsize),
2757  jc.data(), result.data());
2758  } else if (szjc > 2 * MAX_TEXSIZE) {
2759  PBA_BIND_TEX1D_4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4, jc, cudaReadModeElementType, PBA_ChanFloat4());
2760  GetBlockConfiguration(nblock, bw, bh);
2761  dim3 grid(bw, bh), block(bsize);
2762  multiply_jcx_kernel<4><<<grid, block>>>(len, (bw * bsize), result.data());
2763  } else if (szjc > MAX_TEXSIZE) {
2764  PBA_BIND_TEX1D_2(tex_mjx_jc, tex_mjx_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2765  GetBlockConfiguration(nblock, bw, bh);
2766  dim3 grid(bw, bh), block(bsize);
2767  multiply_jcx_kernel<2><<<grid, block>>>(len, (bw * bsize), result.data());
2768  } else {
2769  PBA_BIND_TEX1D(tex_mjx_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2770  GetBlockConfiguration(nblock, bw, bh);
2771  dim3 grid(bh, bw), block(bsize);
2772  multiply_jcx_kernel<1><<<grid, block>>>(len, (bh * bsize), result.data());
2773  }
2774  CheckErrorCUDA("ComputeJCX");
2775  } else if (mode == 2) {
2776  size_t szjp = jp.GetDataSize();
2777  if (szjp > MAX_TEXSIZE) {
2778  PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2779  GetBlockConfiguration(nblock, bw, bh);
2780  dim3 grid(bw, bh), block(bsize);
2781  multiply_jpx_kernel<2><<<grid, block>>>(len, (bw * bsize), point_offset,
2782  result.data());
2783  } else {
2784  PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2785  GetBlockConfiguration(nblock, bw, bh);
2786  dim3 grid(bh, bw), block(bsize);
2787  multiply_jpx_kernel<1><<<grid, block>>>(len, (bh * bsize), point_offset,
2788  result.data());
2789  }
2790  CheckErrorCUDA("ComputeJPX");
2791  }
2792 }
2793 
2794 template <bool md, bool pd>
2795 __device__ void jacobian_internal(int camera_pos, int pt_pos, int tidx,
2796  float* r, float jic, float* jxc, float* jyc,
2797  float* jxp, float* jyp) {
2798  float m[3];
2799  float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
2800  float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
2801  r[0] = r1.x;
2802  r[1] = r1.y;
2803  r[2] = r1.z;
2804  r[3] = r1.w;
2805  float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
2806  r[4] = r2.x;
2807  r[5] = r2.y;
2808  r[6] = r2.z;
2809  r[7] = r2.w;
2810  float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
2811  r[8] = r3.x;
2812 
2813  float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
2814  m[0] = temp.x;
2815  m[1] = temp.y;
2816  m[2] = temp.z;
2817 
2818  float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
2819  float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
2820  float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
2821  float f_p2 = FDIV(ft.x, z0 + ft.w);
2822  float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
2823  float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
2824 
2825  if (pd) {
2826  float rr1 = r3.y * p0_p2 * p0_p2;
2827  float rr2 = r3.y * p1_p2 * p1_p2;
2828  float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
2829  float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
2830 
2831  JACOBIAN_SET_JC_BEGIN
2832  float jfc = jic * (1 + rr1 + rr2);
2833  float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
2834  /////////////////////////////////////////////////////
2835  jxc[0] = p0_p2 * jfc;
2836  jxc[1] = f_p2_x;
2837  jxc[2] = 0;
2838  jxc[3] = -f_p2_x * p0_p2;
2839  jxc[4] = -f_p2_x * p0_p2 * y0;
2840  jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
2841  jxc[6] = -f_p2_x * y0;
2842  jxc[7] = ft_x_pn * p0_p2;
2843 
2844  jyc[0] = p1_p2 * jfc;
2845  jyc[1] = 0;
2846  jyc[2] = f_p2_y;
2847  jyc[3] = -f_p2_y * p1_p2;
2848  jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
2849  jyc[5] = f_p2_y * x0 * p1_p2;
2850  jyc[6] = f_p2_y * x0;
2851  jyc[7] = ft_x_pn * p1_p2;
2852  JACOBIAN_SET_JC_END
2853  ///////////////////////////////////
2854  jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
2855  jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
2856  jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
2857  jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
2858  jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
2859  jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
2860  } else {
2861  JACOBIAN_SET_JC_BEGIN
2862  jxc[0] = p0_p2 * jic;
2863  jxc[1] = f_p2;
2864  jxc[2] = 0;
2865  jxc[3] = -f_p2 * p0_p2;
2866  jxc[4] = -f_p2 * p0_p2 * y0;
2867  jxc[5] = f_p2 * (z0 + x0 * p0_p2);
2868  jxc[6] = -f_p2 * y0;
2869 
2870  jyc[0] = p1_p2 * jic;
2871  jyc[1] = 0;
2872  jyc[2] = f_p2;
2873  jyc[3] = -f_p2 * p1_p2;
2874  jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
2875  jyc[5] = f_p2 * x0 * p1_p2;
2876  jyc[6] = f_p2 * x0;
2877 
2878  if (md) {
2879  float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
2880  float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
2881  jxc[7] = -ms.x * msn;
2882  jyc[7] = -ms.y * msn;
2883  } else {
2884  jxc[7] = 0;
2885  jyc[7] = 0;
2886  }
2887  JACOBIAN_SET_JC_END
2888  ///////////////////////////////////
2889  jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
2890  jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
2891  jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
2892  jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
2893  jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
2894  jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
2895  }
2896 }
2897 
2898 template <bool md, bool pd>
2899 __device__ void jacobian_camera_internal(int camera_pos, int pt_pos, int tidx,
2900  float* r, float jic, float* jxc,
2901  float* jyc) {
2902  float m[3];
2903  float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
2904  float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
2905  r[0] = r1.x;
2906  r[1] = r1.y;
2907  r[2] = r1.z;
2908  r[3] = r1.w;
2909  float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
2910  r[4] = r2.x;
2911  r[5] = r2.y;
2912  r[6] = r2.z;
2913  r[7] = r2.w;
2914  float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
2915  r[8] = r3.x;
2916 
2917  float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
2918  m[0] = temp.x;
2919  m[1] = temp.y;
2920  m[2] = temp.z;
2921 
2922  float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
2923  float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
2924  float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
2925  float f_p2 = FDIV(ft.x, z0 + ft.w);
2926  float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
2927  float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
2928 #ifndef PBA_DISABLE_CONST_CAMERA
2929  if (r3.w != 0.0f) {
2930  jxc[0] = 0;
2931  jxc[1] = 0;
2932  jxc[2] = 0;
2933  jxc[3] = 0;
2934  jxc[4] = 0;
2935  jxc[5] = 0;
2936  jxc[6] = 0;
2937  jxc[7] = 0;
2938  jyc[0] = 0;
2939  jyc[1] = 0;
2940  jyc[2] = 0;
2941  jyc[3] = 0;
2942  jyc[4] = 0;
2943  jyc[5] = 0;
2944  jyc[6] = 0;
2945  jyc[7] = 0;
2946  } else
2947 #endif
2948  if (pd) {
2949  float rr1 = r3.y * p0_p2 * p0_p2;
2950  float rr2 = r3.y * p1_p2 * p1_p2;
2951  float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
2952  float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
2953  float jfc = jic * (1 + rr1 + rr2);
2954  float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
2955  /////////////////////////////////////////////////////
2956  jxc[0] = p0_p2 * jfc;
2957  jxc[1] = f_p2_x;
2958  jxc[2] = 0;
2959  jxc[3] = -f_p2_x * p0_p2;
2960  jxc[4] = -f_p2_x * p0_p2 * y0;
2961  jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
2962  jxc[6] = -f_p2_x * y0;
2963  jxc[7] = ft_x_pn * p0_p2;
2964 
2965  jyc[0] = p1_p2 * jfc;
2966  jyc[1] = 0;
2967  jyc[2] = f_p2_y;
2968  jyc[3] = -f_p2_y * p1_p2;
2969  jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
2970  jyc[5] = f_p2_y * x0 * p1_p2;
2971  jyc[6] = f_p2_y * x0;
2972  jyc[7] = ft_x_pn * p1_p2;
2973  } else {
2974  jxc[0] = p0_p2 * jic;
2975  jxc[1] = f_p2;
2976  jxc[2] = 0;
2977  jxc[3] = -f_p2 * p0_p2;
2978  jxc[4] = -f_p2 * p0_p2 * y0;
2979  jxc[5] = f_p2 * (z0 + x0 * p0_p2);
2980  jxc[6] = -f_p2 * y0;
2981 
2982  jyc[0] = p1_p2 * jic;
2983  jyc[1] = 0;
2984  jyc[2] = f_p2;
2985  jyc[3] = -f_p2 * p1_p2;
2986  jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
2987  jyc[5] = f_p2 * x0 * p1_p2;
2988  jyc[6] = f_p2 * x0;
2989 
2990  if (md) {
2991  float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
2992  float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
2993  jxc[7] = -ms.x * msn;
2994  jyc[7] = -ms.y * msn;
2995  } else {
2996  jxc[7] = 0;
2997  jyc[7] = 0;
2998  }
2999  }
3000 }
3001 
3002 template <bool pd>
3003 __device__ void jacobian_point_internal(int camera_pos, int pt_pos, int tidx,
3004  float* r, float* jxp, float* jyp) {
3005  float m[3];
3006  float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
3007  float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
3008  r[0] = r1.x;
3009  r[1] = r1.y;
3010  r[2] = r1.z;
3011  r[3] = r1.w;
3012  float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
3013  r[4] = r2.x;
3014  r[5] = r2.y;
3015  r[6] = r2.z;
3016  r[7] = r2.w;
3017  float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
3018  r[8] = r3.x;
3019 
3020  float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
3021  m[0] = temp.x;
3022  m[1] = temp.y;
3023  m[2] = temp.z;
3024 
3025  float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
3026  float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
3027  float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
3028  float f_p2 = FDIV(ft.x, z0 + ft.w);
3029  float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
3030  float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
3031 
3032  if (pd) {
3033  float rr1 = r3.y * p0_p2 * p0_p2;
3034  float rr2 = r3.y * p1_p2 * p1_p2;
3035  float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
3036  float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
3037  ///////////////////////////////////
3038  jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
3039  jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
3040  jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
3041  jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
3042  jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
3043  jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
3044  } else {
3045  ///////////////////////////////////
3046  jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
3047  jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
3048  jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
3049  jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
3050  jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
3051  jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
3052  }
3053 }
3054 
3055 template <bool md, bool pd>
3056 __global__ void multiply_jx_noj_kernel(int num, int bwidth, int offset,
3057  float jic, float2* result) {
3058  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3059  if (index >= num) return;
3060 
3061  __shared__ float data[9 * 64];
3062  ////////////////////////////////////////////
3063  int2 proj = tex1Dfetch(tex_mjx_idx, index);
3064  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
3065  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
3066  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
3067 
3068  ////////////////////////////////////////////
3069  float jxc[8], jyc[8], jxp[3], jyp[3];
3070  jacobian_internal<md, pd>(proj.x << 1, proj.y, index, data + 9 * threadIdx.x,
3071  jic, jxc, jyc, jxp, jyp);
3072 
3073  /////////////////////////////////////
3074  result[index] = make_float2(
3075  jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w +
3076  jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w +
3077  jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z,
3078  jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w +
3079  jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w +
3080  jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z);
3081 }
3082 
3083 template <bool md, bool pd>
3084 __global__ void multiply_jcx_noj_kernel(int num, int bwidth, float jic,
3085  float2* result) {
3086  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3087  if (index >= num) return;
3088 
3089  __shared__ float data[9 * 64];
3090  ////////////////////////////////////////////
3091  int2 proj = tex1Dfetch(tex_mjx_idx, index);
3092  float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
3093  float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
3094 
3095  ////////////////////////////////////////////
3096  float jxc[8], jyc[8];
3097  jacobian_camera_internal<md, pd>(proj.x << 1, proj.y, index,
3098  data + 9 * threadIdx.x, jic, jxc, jyc);
3099 
3100  /////////////////////////////////////
3101  result[index] = make_float2(
3102  jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w +
3103  jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w,
3104  jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w +
3105  jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w);
3106 }
3107 
3108 template <bool pd>
3109 __global__ void multiply_jpx_noj_kernel(int num, int bwidth, int offset,
3110  float2* result) {
3111  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3112  if (index >= num) return;
3113 
3114  __shared__ float data[9 * 64];
3115  ////////////////////////////////////////////
3116  int2 proj = tex1Dfetch(tex_mjx_idx, index);
3117  float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
3118 
3119  ////////////////////////////////////////////
3120  float jxp[3], jyp[3];
3121  jacobian_point_internal<pd>(proj.x << 1, proj.y, index,
3122  data + 9 * threadIdx.x, jxp, jyp);
3123 
3124  /////////////////////////////////////
3125  result[index] = make_float2(jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z,
3126  jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z);
3127 }
3128 
3129 void ProgramCU::ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera,
3130  CuTexImage& point, CuTexImage& meas,
3131  CuTexImage& pjmap, bool intrinsic_fixed,
3132  int radial_distortion, int mode) {
3133  unsigned int nproj = pjmap.GetImgWidth();
3134  unsigned int len = nproj;
3135  unsigned int bsize = 64;
3136  unsigned int nblock = (len + bsize - 1) / bsize;
3137  unsigned int bw, bh;
3138  int point_offset = camera.GetImgWidth() * 2;
3139  float jfc = intrinsic_fixed ? 0 : 1.0f;
3140 
3141  /////////////////////////////
3142  PBA_BIND_TEX1D(tex_mjx_idx, pjmap, cudaReadModeElementType, PBA_ChanInt2());
3143  PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
3144  PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3145  PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3146 
3147  ///////////////////////////////////
3148  GetBlockConfiguration(nblock, bw, bh);
3149  dim3 grid(bw, bh), block(bsize);
3150 
3151  if (mode == 0) {
3152  if (radial_distortion == -1) {
3153  PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3154  multiply_jx_noj_kernel<true, false><<<grid, block>>>(
3155  len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3156  } else if (radial_distortion) {
3157  multiply_jx_noj_kernel<false, true><<<grid, block>>>(
3158  len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3159  } else {
3160  multiply_jx_noj_kernel<false, false><<<grid, block>>>(
3161  len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3162  }
3163 
3164  CheckErrorCUDA("ComputeJX_");
3165  } else if (mode == 1) {
3166  if (radial_distortion == -1) {
3167  PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3168  multiply_jcx_noj_kernel<true, false><<<grid, block>>>(
3169  len, (bw * bsize), jfc, (float2*)jx.data());
3170  } else if (radial_distortion) {
3171  multiply_jcx_noj_kernel<false, true><<<grid, block>>>(
3172  len, (bw * bsize), jfc, (float2*)jx.data());
3173  } else {
3174  multiply_jcx_noj_kernel<false, false><<<grid, block>>>(
3175  len, (bw * bsize), jfc, (float2*)jx.data());
3176  }
3177 
3178  CheckErrorCUDA("ComputeJCX_");
3179  } else if (mode == 2) {
3180  if (radial_distortion == 1) {
3181  multiply_jpx_noj_kernel<true><<<grid, block>>>(
3182  len, (bw * bsize), point_offset, (float2*)jx.data());
3183  } else {
3184  multiply_jpx_noj_kernel<false><<<grid, block>>>(
3185  len, (bw * bsize), point_offset, (float2*)jx.data());
3186  }
3187 
3188  CheckErrorCUDA("ComputeJX_");
3189  }
3190 }
3191 
3192 template <bool md, bool pd, int KH>
3193 __global__ void jte_cam_vec_noj_kernel(int num, int rowsz, float jic,
3194  float* jte) {
3195  __shared__ float value[KH * 32 * 9]; // 8 * KH * 32
3196  int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
3197  if (cam >= num) return;
3198 
3199  // read data range for this camera
3200  // 8 thread will do the same thing
3201  int idx1 = tex1Dfetch(tex_jte_cmp, cam); // first camera
3202  int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1); // last camera + 1
3203 
3204  float* valuec = value + 32 * 9 * threadIdx.y;
3205  float* rp = valuec + threadIdx.x * 9;
3206  float rr[8], jxc[8], jyc[8];
3207  for (int i = 0; i < 8; ++i) rr[i] = 0;
3208 
3209  // loop to read the index of the projection.
3210  // so to get the location to read the jacobian
3211  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3212  int index = tex1Dfetch(tex_jte_cmt, i);
3213  int2 proj = tex1Dfetch(tex_jacobian_idx, index);
3214  jacobian_camera_internal<md, pd>(cam << 2, proj.y, index, rp, jic, jxc,
3215  jyc);
3216  float2 vv = tex1Dfetch(tex_jte_pe, index);
3217  //
3218  for (int j = 0; j < 8; ++j) rr[j] += (jxc[j] * vv.x + jyc[j] * vv.y);
3219  }
3220 
3221  float* valuei = valuec + 8 * threadIdx.x;
3222  for (int i = 0; i < 8; ++i) valuei[i] = rr[i];
3223  valuec[threadIdx.x] = (valuec[threadIdx.x] + valuec[threadIdx.x + 32] +
3224  valuec[threadIdx.x + 64] + valuec[threadIdx.x + 96] +
3225  valuec[threadIdx.x + 128] + valuec[threadIdx.x + 160] +
3226  valuec[threadIdx.x + 192] + valuec[threadIdx.x + 224]);
3227  if (threadIdx.x < 16) valuec[threadIdx.x] += valuec[threadIdx.x + 16];
3228  if (threadIdx.x < 8)
3229  valuec[threadIdx.x] = valuec[threadIdx.x] + valuec[threadIdx.x + 8];
3230 
3231  ////////////////////////////////////
3232  if (threadIdx.x < 8) jte[(cam << 3) + threadIdx.x] = valuec[threadIdx.x];
3233 }
3234 
3235 template <bool pd, int KH>
3236 __global__ void jte_point_vec_noj_kernel(int num, int rowsz, float* jte) {
3237  ////////////////////////////
3238  __shared__ float value[KH * (9 * 32)];
3239  int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
3240  if (index >= num) return;
3241 
3242  int idx1 = tex1Dfetch(tex_jte_pmp, index); // first
3243  int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1
3244  float rx = 0, ry = 0, rz = 0, jxp[3], jyp[3];
3245  int rowp = threadIdx.y * 9 * 32;
3246  float* rp = value + threadIdx.x * 9 + rowp;
3247  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3248  float2 ev = tex1Dfetch(tex_jte_pe, i);
3249  int2 proj = tex1Dfetch(tex_jacobian_idx, i);
3250  jacobian_point_internal<pd>(proj.x << 1, proj.y, i, rp, jxp, jyp);
3251  rx += (jxp[0] * ev.x + jyp[0] * ev.y);
3252  ry += (jxp[1] * ev.x + jyp[1] * ev.y);
3253  rz += (jxp[2] * ev.x + jyp[2] * ev.y);
3254  }
3255 
3256  int loc = (threadIdx.x << 2) + rowp;
3257  value[loc] = rx;
3258  value[loc + 1] = ry;
3259  value[loc + 2] = rz;
3260  value[loc + 3] = 0;
3261 
3262  int ridx = threadIdx.x + rowp;
3263  value[ridx] = ((value[ridx] + value[ridx + 32]) +
3264  (value[ridx + 64] + value[ridx + 96]));
3265  if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
3266  if (threadIdx.x < 8) value[ridx] += value[ridx + 8];
3267  if (threadIdx.x < 4)
3268  jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4];
3269 }
3270 
3271 void ProgramCU::ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera,
3272  CuTexImage& point, CuTexImage& meas,
3273  CuTexImage& cmap, CuTexImage& cmlist,
3274  CuTexImage& pmap, CuTexImage& pjmap, CuTexImage& jp,
3275  bool intrinsic_fixed, int radial_distortion,
3276  int mode) {
3277  PBA_BIND_TEX1D(tex_jacobian_idx, pjmap, cudaReadModeElementType, PBA_ChanInt2());
3278  PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3279  PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3280  if (radial_distortion) PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3281 
3282  PBA_BIND_TEX1D(tex_jte_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
3283  PBA_BIND_TEX1D(tex_jte_cmt, cmlist, cudaReadModeElementType, PBA_ChanInt());
3284  PBA_BIND_TEX1D(tex_jte_pe, e, cudaReadModeElementType, PBA_ChanFloat2());
3285 
3286  //
3287  unsigned int bw, bh;
3288  float jfc = intrinsic_fixed ? 0 : 1.0f;
3289  int ncam = camera.GetImgWidth();
3290  const int bheight1 = 2, bsize = 32;
3291  int nblock1 = (ncam + bheight1 - 1) / bheight1;
3292  GetBlockConfiguration(nblock1, bw, bh);
3293  dim3 grid(bw, bh), block(bsize, bheight1);
3294  if (mode == 2) {
3295  } else if (radial_distortion == -1)
3296  jte_cam_vec_noj_kernel<true, false, bheight1><<<grid, block>>>(
3297  ncam, bw * bheight1, jfc, jte.data());
3298  else if (radial_distortion)
3299  jte_cam_vec_noj_kernel<false, true, bheight1><<<grid, block>>>(
3300  ncam, bw * bheight1, jfc, jte.data());
3301  else
3302  jte_cam_vec_noj_kernel<false, false, bheight1><<<grid, block>>>(
3303  ncam, bw * bheight1, jfc, jte.data());
3304  CheckErrorCUDA("ComputeJtE_<Camera>");
3305 
3306  int npt = point.GetImgWidth();
3307  unsigned int offsetv = 8 * ncam;
3308  const int bheight2 = 2, bsize2 = 32;
3309  int nblock2 = (npt + bheight2 - 1) / bheight2;
3310  GetBlockConfiguration(nblock2, bw, bh);
3311  dim3 grid2(bw, bh), block2(bsize2, bheight2);
3312  if (mode == 1) {
3313  } else if (jp.IsValid()) {
3314  PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3315  PBA_BIND_TEX1D(tex_jte_pex, e, cudaReadModeElementType, PBA_ChanFloat());
3316  PBA_BIND_TEX1D_2(tex_jte_jp, tex_jte_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
3317  if (jp.GetDataSize() > MAX_TEXSIZE)
3318  jte_point_vec_kernel<bheight2, 2><<<grid2, block2>>>(
3319  npt, bw * bheight2, jte.data() + offsetv);
3320  else
3321  jte_point_vec_kernel<bheight2, 1><<<grid2, block2>>>(
3322  npt, bw * bheight2, jte.data() + offsetv);
3323  } else {
3324  PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3325  if (radial_distortion && radial_distortion != -1)
3326  jte_point_vec_noj_kernel<true, bheight2><<<grid2, block2>>>(
3327  npt, bw * bheight2, jte.data() + offsetv);
3328  else
3329  jte_point_vec_noj_kernel<false, bheight2><<<grid2, block2>>>(
3330  npt, bw * bheight2, jte.data() + offsetv);
3331  }
3332  CheckErrorCUDA("ComputeJtE_<Point>");
3333 }
3334 
3335 template <int KH, bool md, bool pd, bool scaling>
3336 __global__ void jtjd_cam_block_noj_kernel(int num, int rowsz, float lambda1,
3337  float lambda2, float jic, float* diag,
3338  float* blocks,
3339  bool add_existing_diagc) {
3340  const int VN = (md || pd) ? 8 : 7;
3341  __shared__ float buffer_all[32 * 9 * KH];
3342  __shared__ float value_all[64 * KH];
3343 
3344  // 8thread per camera
3345  int bcam = blockIdx.x * KH + blockIdx.y * rowsz;
3346 
3347  int cam = bcam + threadIdx.y;
3348  if (cam >= num) return;
3349 
3350  float* buffer = buffer_all + threadIdx.y * (32 * 9);
3351  float* value = value_all + threadIdx.y * 64;
3352 
3353  float jxc[8], jyc[8];
3354  float* rp = buffer + threadIdx.x * 9;
3355  float row0[VN], row1[VN - 1], row2[VN - 2], row3[VN - 3];
3356  float row4[VN - 4], row5[VN - 5], row6[VN - 6], row7[1] = {0};
3357  // read data range for this camera
3358  // 8 thread will do the same thing
3359  int idx1 = tex1Dfetch(tex_jtjd_cmp, cam); // first camera
3360  int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1); // last camera + 1
3361 
3362 #define REPEAT7(FUNC) \
3363  FUNC(0); \
3364  FUNC(1); \
3365  FUNC(2); \
3366  FUNC(3); \
3367  FUNC(4); \
3368  FUNC(5); \
3369  FUNC(6);
3370 #define SETZERO(k) \
3371  for (int j = 0; j < VN - k; ++j) row##k[j] = 0;
3372  REPEAT7(SETZERO);
3373 
3374  float4 sjv[2];
3375  if (scaling && (pd || md)) {
3376  sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1));
3377  sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1);
3378  }
3379 
3380  // loop to read the index of the projection.
3381  // so to get the location to read the jacobian
3382  for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3383  /////////////////////////////////////////
3384  int index = tex1Dfetch(tex_jtjd_cmlist, i);
3385  int2 proj = tex1Dfetch(tex_jacobian_idx, index);
3386 
3387  ///////////////////////////////////////////////
3388  jacobian_camera_internal<md, pd>(cam << 2, proj.y, index, rp, jic, jxc,
3389  jyc);
3390 
3391  if (scaling && (pd || md)) {
3392  float* sj = (float*)sjv; // 32 threads...64 values
3393  for (int j = 0; j < VN; ++j) {
3394  jxc[j] *= sj[j];
3395  jyc[j] *= sj[j];
3396  }
3397  }
3398 
3399 ////////////////////////////////////////////////
3400 #define ADDROW(k) \
3401  for (int j = k; j < VN; ++j) \
3402  row##k[j - k] += (jxc[k] * jxc[j] + jyc[k] * jyc[j])
3403 
3404  ///////////////
3405  REPEAT7(ADDROW);
3406  if (VN == 8) {
3407  ADDROW(7);
3408  }
3409  }
3410 
3411 ////////////////////////////////////
3412 // make the matrix..//add up the 32 * 8 matrix
3413 #define JTJDSUM8_V1() \
3414  buffer[threadIdx.x] = \
3415  (buffer[threadIdx.x] + buffer[threadIdx.x + 32] + \
3416  buffer[threadIdx.x + 64] + buffer[threadIdx.x + 96] + \
3417  buffer[threadIdx.x + 128] + buffer[threadIdx.x + 160] + \
3418  buffer[threadIdx.x + 192] + buffer[threadIdx.x + 224]);
3419 
3420 #define JTJDSUM8_V2() \
3421  buffer[threadIdx.x] = \
3422  (((buffer[threadIdx.x] + buffer[threadIdx.x + 128]) + \
3423  (buffer[threadIdx.x + 64] + buffer[threadIdx.x + 192])) + \
3424  ((buffer[threadIdx.x + 32] + buffer[threadIdx.x + 160]) + \
3425  (buffer[threadIdx.x + 96] + buffer[threadIdx.x + 224])));
3426 
3427 #define STORE_ROWS(k) \
3428  for (int i = 0; i < (VN - k); ++i) bufi[i] = row##k[i]; \
3429  JTJDSUM8_V2(); \
3430  if (threadIdx.x < 16 - k) buffer[threadIdx.x] += buffer[threadIdx.x + 16]; \
3431  if (threadIdx.x < 8 - k) \
3432  value[threadIdx.x + k * 9] = buffer[threadIdx.x] + buffer[threadIdx.x + 8];
3433 
3434  float* bufi = buffer + threadIdx.x * 8;
3435  REPEAT7(STORE_ROWS);
3436  if (VN == 8) {
3437  STORE_ROWS(7);
3438  }
3439 
3440  /////////////////////////////////////////////////////////////////////////////////////////////
3441 
3442  //////////////////////////////// (8 * i + j) -> (8 * j + i)
3443  //#define COPYSYM(i) if(threadIdx.x < VN - i - 1) value[threadIdx.x * 8 + i *
3444  //9 + 8] = value[threadIdx.x + i * 9 + 1];
3445  if (threadIdx.x < VN - 1) value[threadIdx.x * 8 + 8] = value[threadIdx.x + 1];
3446  if (threadIdx.x < VN - 2)
3447  value[threadIdx.x * 8 + 17] = value[threadIdx.x + 10];
3448  if (threadIdx.x < VN - 3)
3449  value[threadIdx.x * 8 + 26] = value[threadIdx.x + 19];
3450  if (threadIdx.x < VN - 4)
3451  value[threadIdx.x * 8 + 35] = value[threadIdx.x + 28];
3452  if (threadIdx.x < VN - 5)
3453  value[threadIdx.x * 8 + 44] = value[threadIdx.x + 37];
3454  if (threadIdx.x < VN - 6)
3455  value[threadIdx.x * 8 + 53] = value[threadIdx.x + 46];
3456  if (VN == 8 && threadIdx.x < VN - 7)
3457  value[threadIdx.x * 8 + 62] = value[threadIdx.x + 55];
3458 
3459  if (scaling && !pd && !md) {
3460  float4 sjv[2];
3461  float* sj = (float*)sjv; // 32 threads...64 values
3462  sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1));
3463  sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1);
3464  float sji = sj[threadIdx.x & 0x07];
3465  value[threadIdx.x] *= (sji * sj[threadIdx.x / 8]);
3466  value[threadIdx.x + 32] *= (sji * sj[4 + threadIdx.x / 8]);
3467  }
3468 
3469  bool zero = ((threadIdx.x & 0x7) == VN);
3470 
3471  ///////////write back
3472  if (threadIdx.x < 8) {
3473  float* dp = value + threadIdx.x * 9;
3474  float temp = zero ? 0 : dp[0];
3475  int didx = threadIdx.x + (cam << 3);
3476  if (add_existing_diagc) temp += diag[didx];
3477  diag[didx] = temp;
3478  dp[0] = lambda1 + lambda2 * temp;
3479  }
3480  int wpos = cam * (8 * VN) + threadIdx.x;
3481  blocks[wpos] = zero ? 0 : value[threadIdx.x];
3482  if (threadIdx.x < VN * 8 - 32)
3483  blocks[wpos + 32] = zero ? 0 : value[threadIdx.x + 32];
3484 }
3485 
3486 template <int KW, bool pd, bool scaling>
3487 __global__ void jtjd_point_block_noj_kernel(int num, int rowsz, float lambda1,
3488  float lambda2, float4* diag,
3489  float4* blocks, int ptx) {
3490  ////////////////////////////
3491  int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
3492  if (index >= num) return;
3493 
3494  __shared__ float value[KW * 9];
3495  int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first
3496  int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last + 1
3497 
3498  float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
3499  float jxp[3], jyp[3];
3500  float* rp = value + threadIdx.x * 9;
3501 
3502  float4 sj;
3503  if (scaling && pd) sj = tex1Dfetch(tex_jacobian_sj, index + ptx);
3504 
3505  for (int i = idx1; i < idx2; ++i) {
3506  int2 proj = tex1Dfetch(tex_jacobian_idx, i);
3507  jacobian_point_internal<pd>(proj.x << 1, proj.y, i, rp, jxp, jyp);
3508 
3509  if (scaling && pd) {
3510  jxp[0] *= sj.x;
3511  jxp[1] *= sj.y;
3512  jxp[2] *= sj.z;
3513  jyp[0] *= sj.x;
3514  jyp[1] *= sj.y;
3515  jyp[2] *= sj.z;
3516  }
3517  M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
3518  M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
3519  M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
3520  M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
3521  M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
3522  M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
3523  }
3524 
3525  if (scaling && !pd) {
3526  sj = tex1Dfetch(tex_jacobian_sj, index + ptx);
3527  M00 *= (sj.x * sj.x);
3528  M01 *= (sj.x * sj.y);
3529  M02 *= (sj.x * sj.z);
3530  M11 *= (sj.y * sj.y);
3531  M12 *= (sj.y * sj.z);
3532  M22 *= (sj.z * sj.z);
3533  }
3534 
3535  diag[index] = make_float4(M00, M11, M22, 0);
3536 
3537  M00 = lambda2 * M00 + lambda1;
3538  M11 = lambda2 * M11 + lambda1;
3539  M22 = lambda2 * M22 + lambda1;
3540 
3541  // invert the 3x3 matrix.
3542  float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 -
3543  M02 * M02 * M11 - M12 * M12 * M00;
3544  if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
3545  int write_pos = index * 3;
3546  blocks[write_pos] = make_float4(0, 0, 0, 0);
3547  blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
3548  blocks[write_pos + 2] = make_float4(0, 0, 0, 0);
3549  } else {
3550  float m00 = (M11 * M22 - M12 * M12) / det;
3551  float m01 = -(M01 * M22 - M12 * M02) / det;
3552  float m02 = (M01 * M12 - M02 * M11) / det;
3553  int write_pos = index * 3;
3554  blocks[write_pos] = make_float4(m00, m01, m02, 0);
3555 
3556  float m11 = (M00 * M22 - M02 * M02) / det;
3557  float m12 = -(M00 * M12 - M01 * M02) / det;
3558  blocks[write_pos + 1] = make_float4(m01, m11, m12, 0);
3559 
3560  float m22 = (M00 * M11 - M01 * M01) / det;
3561  blocks[write_pos + 2] = make_float4(m02, m12, m22, 0);
3562  }
3563 }
3564 
3565 void ProgramCU::ComputeDiagonalBlock_(
3566  float lambda, bool dampd, CuTexImage& camera, CuTexImage& point,
3567  CuTexImage& meas, CuTexImage& cmap, CuTexImage& cmlist, CuTexImage& pmap,
3568  CuTexImage& jmap, CuTexImage& jp, CuTexImage& sj, CuTexImage& diag,
3569  CuTexImage& blocks, bool intrinsic_fixed, int radial_distortion,
3570  bool add_existing_diagc, int mode) {
3571  float lambda1 = dampd ? 0.0f : lambda;
3572  float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
3573  float jfc = intrinsic_fixed ? 0.0f : 1.0f;
3574 
3575  //////////////////////////////////
3576  PBA_BIND_TEX1D(tex_jacobian_idx, jmap, cudaReadModeElementType, PBA_ChanInt2());
3577  PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3578  PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3579  PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
3580  PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
3581 
3582  ////////////////////////////////////////////////////
3583  const unsigned int bsize1 = 32;
3584  const unsigned int bheight1 = 2;
3585  unsigned int ncam = camera.GetImgWidth(); // how many cameras
3586  unsigned int nblock = (ncam + bheight1 - 1) / bheight1;
3587  unsigned int bw, bh;
3588  GetBlockConfiguration(nblock, bw, bh);
3589  dim3 block1(bsize1, bheight1), grid1(bw, bh);
3590 
3591  ///////////////////////////////////////////////////
3592  if (radial_distortion == -1) PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3593  if (mode == 2) {
3594  // skip the camera part.
3595  } else if (sj.IsValid()) {
3596  PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3597  if (radial_distortion == -1)
3598  jtjd_cam_block_noj_kernel<bheight1, true, false, true><<<grid1, block1>>>(
3599  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3600  blocks.data(), add_existing_diagc);
3601  else if (radial_distortion)
3602  jtjd_cam_block_noj_kernel<bheight1, false, true, true><<<grid1, block1>>>(
3603  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3604  blocks.data(), add_existing_diagc);
3605  else
3606  jtjd_cam_block_noj_kernel<bheight1, false, false,
3607  true><<<grid1, block1>>>(
3608  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3609  blocks.data(), add_existing_diagc);
3610  } else {
3611  if (radial_distortion == -1)
3612  jtjd_cam_block_noj_kernel<bheight1, true, false,
3613  false><<<grid1, block1>>>(
3614  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3615  blocks.data(), add_existing_diagc);
3616  else if (radial_distortion)
3617  jtjd_cam_block_noj_kernel<bheight1, false, true,
3618  false><<<grid1, block1>>>(
3619  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3620  blocks.data(), add_existing_diagc);
3621  else
3622  jtjd_cam_block_noj_kernel<bheight1, false, false,
3623  false><<<grid1, block1>>>(
3624  ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3625  blocks.data(), add_existing_diagc);
3626  }
3627  CheckErrorCUDA("ComputeDiagonalBlock_<Camera>");
3628 
3629  ////////////////////////////////////////////////////
3630  const unsigned int bsize2 = 64;
3631  unsigned int npoint = point.GetImgWidth();
3632  unsigned int len2 = npoint;
3633  unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
3634  unsigned int offsetd = 2 * ncam;
3635  unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam;
3636  GetBlockConfiguration(nblock2, bw, bh);
3637  dim3 grid2(bw, bh), block2(bsize2);
3638  PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3639 
3640  if (mode == 1) {
3641  } else if (jp.IsValid()) {
3642  PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
3643  if (jp.GetDataSize() > MAX_TEXSIZE)
3644  jtjd_point_block_kernel<2><<<grid2, block2>>>(
3645  len2, (bw * bsize2), lambda1, lambda2,
3646  ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb);
3647  else
3648  jtjd_point_block_kernel<1><<<grid2, block2>>>(
3649  len2, (bw * bsize2), lambda1, lambda2,
3650  ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb);
3651  } else {
3652  if (sj.IsValid()) {
3653  PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3654  if (radial_distortion && radial_distortion != -1)
3655  jtjd_point_block_noj_kernel<bsize2, true, true><<<grid2, block2>>>(
3656  len2, (bw * bsize2), lambda1, lambda2,
3657  ((float4*)diag.data()) + offsetd,
3658  ((float4*)blocks.data()) + offsetb, offsetd);
3659  else
3660  jtjd_point_block_noj_kernel<bsize2, false, true><<<grid2, block2>>>(
3661  len2, (bw * bsize2), lambda1, lambda2,
3662  ((float4*)diag.data()) + offsetd,
3663  ((float4*)blocks.data()) + offsetb, offsetd);
3664  } else {
3665  if (radial_distortion && radial_distortion != -1)
3666  jtjd_point_block_noj_kernel<bsize2, true, false><<<grid2, block2>>>(
3667  len2, (bw * bsize2), lambda1, lambda2,
3668  ((float4*)diag.data()) + offsetd,
3669  ((float4*)blocks.data()) + offsetb, 0);
3670  else
3671  jtjd_point_block_noj_kernel<bsize2, false, false><<<grid2, block2>>>(
3672  len2, (bw * bsize2), lambda1, lambda2,
3673  ((float4*)diag.data()) + offsetd,
3674  ((float4*)blocks.data()) + offsetb, 0);
3675  }
3676  }
3677  CheckErrorCUDA("ComputeDiagonalBlock_<Point>");
3678 
3679  ////////////////////////////////////////////////////
3680  if (mode != 2) {
3681  const unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH;
3682  unsigned int len3 = ncam * 8;
3683  unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3;
3684  dim3 grid3(nblock3), block3(bsize3);
3685  if (radial_distortion)
3686  jtjd_cam_block_invert_kernel<8><<<grid3, block3>>>(
3687  len3, (float4*)blocks.data());
3688  else
3689  jtjd_cam_block_invert_kernel<7><<<grid3, block3>>>(
3690  len3, (float4*)blocks.data());
3691  CheckErrorCUDA("ComputeDiagonalBlockInverse<Camera>");
3692  }
3693 }
3694 
3695 __global__ void projection_q_kernel(int nproj, int rowsz, float2* pj) {
3696  ////////////////////////////////
3697  int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
3698  if (tidx >= nproj) return;
3699  int2 proj = tex1Dfetch(tex_projection_idx, tidx);
3700  float2 wq = tex1Dfetch(tex_projection_mea, tidx);
3701  ///////////////////////////////////
3702  float f1 = tex1Dfetch(tex_projection_cam, proj.x * 4).x;
3703  float r1 = tex1Dfetch(tex_projection_cam, proj.x * 4 + 3).w;
3704  float f2 = tex1Dfetch(tex_projection_cam, proj.y * 4).x;
3705  float r2 = tex1Dfetch(tex_projection_cam, proj.y * 4 + 3).w;
3706  pj[tidx] = make_float2(-wq.x * (f1 - f2), -wq.y * (r1 - r2));
3707 }
3708 
3709 void ProgramCU::ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap,
3710  CuTexImage& qw, CuTexImage& proj,
3711  int offset) {
3712  ///////////////////////////////////////
3713  unsigned int len = qmap.GetImgWidth();
3714  unsigned int bsize = PROJECTION_FRT_KWIDTH;
3715  unsigned int nblock = (len + bsize - 1) / bsize;
3716  unsigned int bw, bh;
3717  GetBlockConfiguration(nblock, bw, bh);
3718  dim3 grid(bw, bh), block(bsize);
3719 
3720  ///////////////////////////////////////////
3721  PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3722  PBA_BIND_TEX1D(tex_projection_idx, qmap, cudaReadModeElementType, PBA_ChanInt2());
3723  PBA_BIND_TEX1D(tex_projection_mea, qw, cudaReadModeElementType, PBA_ChanFloat2());
3724 
3725  //////////////////////////////
3726  projection_q_kernel<<<grid, block>>>(len, bw * bsize,
3727  ((float2*)proj.data()) + offset);
3728 }
3729 
3730 template <bool SJ>
3731 __global__ void multiply_jqx_kernel(int num, int bwidth, float2* result) {
3732  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3733  if (index >= num) return;
3734  ////////////////////////////////////////////
3735  int2 proj = tex1Dfetch(tex_mjx_idx, index);
3736  float2 wq = tex1Dfetch(tex_jacobian_meas, index);
3737  int idx1 = proj.x * 2, idx2 = proj.y * 2;
3738  float x11 = tex1Dfetch(tex_mjx_x, idx1).x;
3739  float x17 = tex1Dfetch(tex_mjx_x, idx1 + 1).w;
3740  float x21 = tex1Dfetch(tex_mjx_x, idx2).x;
3741  float x27 = tex1Dfetch(tex_mjx_x, idx2 + 1).w;
3742 
3743  if (SJ) {
3744  float s11 = tex1Dfetch(tex_jacobian_sj, idx1).x;
3745  float s17 = tex1Dfetch(tex_jacobian_sj, idx1 + 1).w;
3746  float s21 = tex1Dfetch(tex_jacobian_sj, idx2).x;
3747  float s27 = tex1Dfetch(tex_jacobian_sj, idx2 + 1).w;
3748  result[index] = make_float2((x11 * s11 - x21 * s21) * wq.x,
3749  (x17 * s17 - x27 * s27) * wq.y);
3750  } else {
3751  result[index] = make_float2((x11 - x21) * wq.x, (x17 - x27) * wq.y);
3752  }
3753 }
3754 
3755 void ProgramCU::ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq,
3756  CuTexImage& sj, CuTexImage& jx, int offset) {
3757  unsigned int nproj = qmap.GetImgWidth();
3758  unsigned int len = nproj;
3759  unsigned int bsize = 64;
3760  unsigned int nblock = (len + bsize - 1) / bsize;
3761  unsigned int bw, bh;
3762 
3763  /////////////////////////////
3764  PBA_BIND_TEX1D(tex_mjx_idx, qmap, cudaReadModeElementType, PBA_ChanInt2());
3765  PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
3766  PBA_BIND_TEX1D(tex_jacobian_meas, wq, cudaReadModeElementType, PBA_ChanFloat2());
3767 
3768  ///////////////////////////////////
3769  GetBlockConfiguration(nblock, bw, bh);
3770  dim3 grid(bw, bh), block(bsize);
3771 
3772  if (sj.IsValid()) {
3773  PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3774  multiply_jqx_kernel<true><<<grid, block>>>(len, (bw * bsize),
3775  ((float2*)jx.data()) + offset);
3776  } else {
3777  multiply_jqx_kernel<false><<<grid, block>>>(len, (bw * bsize),
3778  ((float2*)jx.data()) + offset);
3779  }
3780 }
3781 
3782 
3783 
3784 template <bool SJ>
3785 __global__ void jte_cam_q_kernel(int num, int bwidth, float* jte) {
3786  // int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz ;
3787  int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3788  if (index >= num) return;
3789  int2 indexp = tex1Dfetch(tex_jte_q_idx, index);
3790  if (indexp.x == -1) return;
3791  float2 wq = tex1Dfetch(tex_jte_q_w, index);
3792  float2 e1 = tex1Dfetch(tex_jte_pe, indexp.x);
3793  float2 e2 = tex1Dfetch(tex_jte_pe, indexp.y);
3794  int index8 = index << 3;
3795  if (SJ) {
3796  float s1 = tex1Dfetch(tex_jacobian_sj, index * 2).x;
3797  jte[index8] += s1 * wq.x * (e1.x - e2.x);
3798  float s7 = tex1Dfetch(tex_jacobian_sj, index * 2 + 1).w;
3799  jte[index8 + 7] += s7 * wq.y * (e1.y - e2.y);
3800  } else {
3801  jte[index8] += wq.x * (e1.x - e2.x);
3802  jte[index8 + 7] += wq.y * (e1.y - e2.y);
3803  }
3804 }
3805 
3806 void ProgramCU::ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq,
3807  CuTexImage& sj, CuTexImage& jte) {
3808  int ncam = qlist.GetImgWidth();
3809  const int bsize = 32;
3810  int nblock = (ncam + bsize - 1) / bsize;
3811  unsigned int bw, bh;
3812  GetBlockConfiguration(nblock, bw, bh);
3813  dim3 grid(bw, bh), block(bsize);
3814 
3815  PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
3816  PBA_BIND_TEX1D(tex_jte_q_idx, qlist, cudaReadModeElementType, PBA_ChanInt2());
3817  PBA_BIND_TEX1D(tex_jte_q_w, wq, cudaReadModeElementType, PBA_ChanFloat2());
3818 
3819  if (sj.IsValid()) {
3820  PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3821  jte_cam_q_kernel<true><<<grid, block>>>(ncam, (bw * bsize), jte.data());
3822  } else {
3823  jte_cam_q_kernel<false><<<grid, block>>>(ncam, (bw * bsize), jte.data());
3824  }
3825 }
3826 
3827 } // namespace pba