1 ////////////////////////////////////////////////////////////////////////////
3 // Author: Changchang Wu
4 // Description : implementation of ProgramCU and all CUDA kernels
6 // Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)
7 // and the University of Washington at Seattle
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU General Public
11 // License as published by the Free Software Foundation; either
12 // Version 3 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // General Public License for more details.
19 ////////////////////////////////////////////////////////////////////////////////
23 #include <cuda_runtime.h>
24 #include <unordered_map>
25 #include "CuTexImage.h"
26 #include "ProgramCU.h"
28 #define IMUL(X, Y) ((X) * (Y))
29 #define FDIV(X, Y) __fdividef(X, Y)
30 #define FDIV2(X, Y) ((X) / (Y))
31 #define MAX_BLOCKLEN 65535
32 #define MAX_BLOCKLEN_ALIGN 65504
33 #define MAX_TEXSIZE (1 << 29)
34 #define TEX_TOOBIG4(sz) (sz >> 31)
35 #define REDUCTION_NBLOCK 32
38 // Helpers to create texture descriptors and channel descriptors (host-side)
39 static inline cudaTextureDesc PBA_MakeTexDesc(cudaTextureReadMode read_mode) {
41 memset(&desc, 0, sizeof(desc));
42 desc.readMode = read_mode;
43 desc.addressMode[0] = cudaAddressModeClamp;
44 desc.addressMode[1] = cudaAddressModeClamp;
45 desc.addressMode[2] = cudaAddressModeClamp;
46 desc.filterMode = cudaFilterModePoint;
47 desc.normalizedCoords = false;
51 static inline cudaChannelFormatDesc PBA_ChanFloat() { return cudaCreateChannelDesc<float>(); }
52 static inline cudaChannelFormatDesc PBA_ChanFloat2() { return cudaCreateChannelDesc<float2>(); }
53 static inline cudaChannelFormatDesc PBA_ChanFloat4() { return cudaCreateChannelDesc<float4>(); }
54 static inline cudaChannelFormatDesc PBA_ChanInt() { return cudaCreateChannelDesc<int>(); }
55 static inline cudaChannelFormatDesc PBA_ChanInt2() { return cudaCreateChannelDesc<int2>(); }
57 // Texture object cache for large-scale binding to avoid excessive creation
64 bool operator==(const PBA_TexKey& other) const {
65 return device_id == other.device_id &&
66 dev_ptr == other.dev_ptr && size_bytes == other.size_bytes && read_mode == other.read_mode &&
67 x == other.x && y == other.y && z == other.z && w == other.w && f == other.f;
71 struct PBA_TexKeyHasher {
72 size_t operator()(const PBA_TexKey& k) const {
73 auto h = std::hash<int>()(k.device_id);
74 size_t addr = reinterpret_cast<size_t>(k.dev_ptr);
75 h ^= std::hash<size_t>()(addr + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2));
76 h ^= std::hash<size_t>()(k.size_bytes + 0x9e3779b97f4a7c15ULL + (h<<6) + (h>>2));
77 h ^= std::hash<int>()(k.read_mode + 0x9e3779b9 + (h<<6) + (h>>2));
78 h ^= std::hash<int>()(k.x + (h<<6) + (h>>2));
79 h ^= std::hash<int>()(k.y + (h<<6) + (h>>2));
80 h ^= std::hash<int>()(k.z + (h<<6) + (h>>2));
81 h ^= std::hash<int>()(k.w + (h<<6) + (h>>2));
82 h ^= std::hash<int>()(k.f + (h<<6) + (h>>2));
87 static std::unordered_map<PBA_TexKey, cudaTextureObject_t, PBA_TexKeyHasher>& PBA_GetTexCache() {
88 static std::unordered_map<PBA_TexKey, cudaTextureObject_t, PBA_TexKeyHasher> cache;
92 static cudaTextureObject_t PBA_AcquireTextureObject1D(CuTexImage& img,
93 const cudaTextureDesc& tex_desc,
94 const cudaChannelFormatDesc& ch_desc) {
95 const void* dev_ptr = img.data();
96 const size_t size_bytes = img.GetDataSize();
97 if (dev_ptr == nullptr || size_bytes == 0) return 0;
99 cudaGetDevice(&device_id);
100 PBA_TexKey key{device_id, dev_ptr, size_bytes, (int)tex_desc.readMode, ch_desc.x, ch_desc.y, ch_desc.z, ch_desc.w, ch_desc.f};
101 auto& cache = PBA_GetTexCache();
102 auto it = cache.find(key);
103 if (it != cache.end()) return it->second;
104 cudaResourceDesc res_desc{};
105 memset(&res_desc, 0, sizeof(res_desc));
106 res_desc.resType = cudaResourceTypeLinear;
107 res_desc.res.linear.devPtr = const_cast<void*>(dev_ptr);
108 res_desc.res.linear.desc = ch_desc;
109 res_desc.res.linear.sizeInBytes = size_bytes;
110 cudaTextureObject_t handle = 0;
111 cudaCreateTextureObject(&handle, &res_desc, &tex_desc, nullptr);
112 cache.emplace(key, handle);
116 static cudaTextureObject_t PBA_AcquireTextureObject1DRange(const void* base_dev_ptr,
119 const cudaTextureDesc& tex_desc,
120 const cudaChannelFormatDesc& ch_desc) {
121 if (base_dev_ptr == nullptr || size_bytes == 0) return 0;
123 cudaGetDevice(&device_id);
124 PBA_TexKey key{device_id, (const char*)base_dev_ptr + byte_offset, size_bytes,
125 (int)tex_desc.readMode, ch_desc.x, ch_desc.y, ch_desc.z, ch_desc.w, ch_desc.f};
126 auto& cache = PBA_GetTexCache();
127 auto it = cache.find(key);
128 if (it != cache.end()) return it->second;
129 cudaResourceDesc res_desc{};
130 memset(&res_desc, 0, sizeof(res_desc));
131 res_desc.resType = cudaResourceTypeLinear;
132 res_desc.res.linear.devPtr = (void*)((const char*)base_dev_ptr + byte_offset);
133 res_desc.res.linear.desc = ch_desc;
134 res_desc.res.linear.sizeInBytes = size_bytes;
135 cudaTextureObject_t handle = 0;
136 cudaCreateTextureObject(&handle, &res_desc, &tex_desc, nullptr);
137 cache.emplace(key, handle);
141 static void PBA_ClearTextureObjectCache() {
142 auto& cache = PBA_GetTexCache();
144 cudaGetDevice(&prev_device);
145 for (auto& kv : cache) {
146 // set device to the device that owns this texture object
147 int dev = kv.first.device_id;
148 if (dev != prev_device) cudaSetDevice(dev);
149 if (kv.second) cudaDestroyTextureObject(kv.second);
151 if (!cache.empty() && prev_device >= 0) cudaSetDevice(prev_device);
155 // Device-side texture object handles
156 __device__ cudaTextureObject_t tex_jacobian_cam;
157 __device__ cudaTextureObject_t tex_jacobian_pts;
158 __device__ cudaTextureObject_t tex_jacobian_idx;
159 __device__ cudaTextureObject_t tex_jacobian_meas;
160 __device__ cudaTextureObject_t tex_jacobian_sj;
161 __device__ cudaTextureObject_t tex_jacobian_shuffle;
163 __device__ cudaTextureObject_t tex_compact_cam;
164 __device__ cudaTextureObject_t tex_uncompressed_cam;
166 __device__ cudaTextureObject_t tex_update_cam;
167 __device__ cudaTextureObject_t tex_update_cam_delta;
169 __device__ cudaTextureObject_t tex_projection_cam;
170 __device__ cudaTextureObject_t tex_projection_idx;
171 __device__ cudaTextureObject_t tex_projection_pts;
172 __device__ cudaTextureObject_t tex_projection_mea;
174 __device__ cudaTextureObject_t tex_jte_pe;
175 __device__ cudaTextureObject_t tex_jte_pex;
176 __device__ cudaTextureObject_t tex_jte_jc;
177 __device__ cudaTextureObject_t tex_jte_jc2;
178 __device__ cudaTextureObject_t tex_jte_cmp;
179 __device__ cudaTextureObject_t tex_jte_cmt;
180 __device__ cudaTextureObject_t tex_jte_jc3;
181 __device__ cudaTextureObject_t tex_jte_jc4;
182 __device__ cudaTextureObject_t tex_jte_jp;
183 __device__ cudaTextureObject_t tex_jte_pmp;
184 __device__ cudaTextureObject_t tex_jte_jp2;
186 __device__ cudaTextureObject_t tex_jtjd_cmp;
187 __device__ cudaTextureObject_t tex_jtjd_cmlist;
188 __device__ cudaTextureObject_t tex_jtjd_jp;
189 __device__ cudaTextureObject_t tex_jtjd_pmp;
190 __device__ cudaTextureObject_t tex_jtjd_jp2;
192 __device__ cudaTextureObject_t tex_shuffle_jc;
193 __device__ cudaTextureObject_t tex_shuffle_map;
194 __device__ cudaTextureObject_t tex_shuffle_jc2;
196 __device__ cudaTextureObject_t tex_mjx_jc;
197 __device__ cudaTextureObject_t tex_mjx_jc2;
198 __device__ cudaTextureObject_t tex_mjx_jc3;
199 __device__ cudaTextureObject_t tex_mjx_jc4;
200 __device__ cudaTextureObject_t tex_mjx_jp;
201 __device__ cudaTextureObject_t tex_mjx_jp2;
202 __device__ cudaTextureObject_t tex_mjx_idx;
203 __device__ cudaTextureObject_t tex_mjx_x;
205 __device__ cudaTextureObject_t tex_jte_q_idx;
206 __device__ cudaTextureObject_t tex_jte_q_w;
208 // Macro to bind a CuTexImage to a device-side texture object symbol using cache
209 // Avoid frequent cudaMemcpyToSymbol if handle unchanged
210 #define PBA_SET_TEX_SYMBOL(sym, handle) \
211 do { static cudaTextureObject_t __last_##sym = 0; \
212 if (__last_##sym != (handle)) { cudaMemcpyToSymbol(sym, &(handle), sizeof(handle)); \
213 __last_##sym = (handle); } } while (0)
215 #define PBA_BIND_TEX1D(sym, img, read_mode, chdesc) \
217 auto __tex_desc = PBA_MakeTexDesc(read_mode); \
218 auto __chan_desc = chdesc; \
219 cudaTextureObject_t __h = PBA_AcquireTextureObject1D(img, __tex_desc, __chan_desc); \
220 PBA_SET_TEX_SYMBOL(sym, __h); \
223 // Bind CuTexImage as two/ four linear segments
224 #define PBA_BIND_TEX1D_2(sym1, sym2, img, read_mode, chdesc) \
226 auto __tex_desc = PBA_MakeTexDesc(read_mode); \
227 auto __chan_desc = chdesc; \
228 size_t __size = img.GetDataSize(); \
229 const void* __base = img.data(); \
230 size_t __elem_bits = (size_t)abs(__chan_desc.x) + abs(__chan_desc.y) + abs(__chan_desc.z) + abs(__chan_desc.w); \
231 size_t __elem_size = (__elem_bits >> 3); \
232 size_t __chunk = MAX_TEXSIZE - (MAX_TEXSIZE % (__elem_size ? __elem_size : 1)); \
233 size_t __sz0 = __size > __chunk ? __chunk : __size; \
234 cudaTextureObject_t __h1 = PBA_AcquireTextureObject1DRange(__base, 0, __sz0, __tex_desc, __chan_desc); \
235 PBA_SET_TEX_SYMBOL(sym1, __h1); \
236 if (__size > __chunk) { \
237 size_t __sz1 = __size - __chunk; \
238 cudaTextureObject_t __h2 = PBA_AcquireTextureObject1DRange(__base, __chunk, __sz1, __tex_desc, __chan_desc); \
239 PBA_SET_TEX_SYMBOL(sym2, __h2); \
243 #define PBA_BIND_TEX1D_4(sym1, sym2, sym3, sym4, img, read_mode, chdesc) \
245 auto __tex_desc = PBA_MakeTexDesc(read_mode); \
246 auto __chan_desc = chdesc; \
247 size_t __size = img.GetDataSize(); \
248 const void* __base = img.data(); \
249 size_t __elem_bits = (size_t)abs(__chan_desc.x) + abs(__chan_desc.y) + abs(__chan_desc.z) + abs(__chan_desc.w); \
250 size_t __elem_size = (__elem_bits >> 3); \
251 size_t __chunk = MAX_TEXSIZE - (MAX_TEXSIZE % (__elem_size ? __elem_size : 1)); \
253 size_t __rem = __size; \
254 cudaTextureObject_t __h1 = PBA_AcquireTextureObject1DRange(__base, __off, \
255 (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
256 PBA_SET_TEX_SYMBOL(sym1, __h1); \
257 __off += (__rem > __chunk ? __chunk : __rem); \
258 __rem = (__rem > __chunk ? __rem - __chunk : 0); \
260 cudaTextureObject_t __h2 = PBA_AcquireTextureObject1DRange(__base, __off, \
261 (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
262 PBA_SET_TEX_SYMBOL(sym2, __h2); \
263 __off += (__rem > __chunk ? __chunk : __rem); \
264 __rem = (__rem > __chunk ? __rem - __chunk : 0); \
267 cudaTextureObject_t __h3 = PBA_AcquireTextureObject1DRange(__base, __off, \
268 (__rem > __chunk ? __chunk : __rem), __tex_desc, __chan_desc); \
269 PBA_SET_TEX_SYMBOL(sym3, __h3); \
270 __off += (__rem > __chunk ? __chunk : __rem); \
271 __rem = (__rem > __chunk ? __rem - __chunk : 0); \
274 cudaTextureObject_t __h4 = PBA_AcquireTextureObject1DRange(__base, __off, __rem, \
275 __tex_desc, __chan_desc); \
276 PBA_SET_TEX_SYMBOL(sym4, __h4); \
280 // Expand untyped tex1Dfetch(tex_symbol, idx) into typed form via token pasting
281 #define tex1Dfetch(tex, idx) tex1Dfetch_##tex(idx)
282 #define tex1Dfetch_tex_jacobian_cam(i) tex1Dfetch<float4>(tex_jacobian_cam, i)
283 #define tex1Dfetch_tex_jacobian_pts(i) tex1Dfetch<float4>(tex_jacobian_pts, i)
284 #define tex1Dfetch_tex_jacobian_idx(i) tex1Dfetch<int2>(tex_jacobian_idx, i)
285 #define tex1Dfetch_tex_jacobian_meas(i) tex1Dfetch<float2>(tex_jacobian_meas, i)
286 #define tex1Dfetch_tex_jacobian_sj(i) tex1Dfetch<float4>(tex_jacobian_sj, i)
287 #define tex1Dfetch_tex_jacobian_shuffle(i) tex1Dfetch<int>(tex_jacobian_shuffle, i)
288 #define tex1Dfetch_tex_compact_cam(i) tex1Dfetch<float4>(tex_compact_cam, i)
289 #define tex1Dfetch_tex_uncompressed_cam(i) tex1Dfetch<float4>(tex_uncompressed_cam, i)
290 #define tex1Dfetch_tex_update_cam(i) tex1Dfetch<float4>(tex_update_cam, i)
291 #define tex1Dfetch_tex_update_cam_delta(i) tex1Dfetch<float4>(tex_update_cam_delta, i)
292 #define tex1Dfetch_tex_projection_cam(i) tex1Dfetch<float4>(tex_projection_cam, i)
293 #define tex1Dfetch_tex_projection_idx(i) tex1Dfetch<int2>(tex_projection_idx, i)
294 #define tex1Dfetch_tex_projection_pts(i) tex1Dfetch<float4>(tex_projection_pts, i)
295 #define tex1Dfetch_tex_projection_mea(i) tex1Dfetch<float2>(tex_projection_mea, i)
296 #define tex1Dfetch_tex_jte_pe(i) tex1Dfetch<float2>(tex_jte_pe, i)
297 #define tex1Dfetch_tex_jte_pex(i) tex1Dfetch<float>(tex_jte_pex, i)
298 #define tex1Dfetch_tex_jte_jc(i) tex1Dfetch<float4>(tex_jte_jc, i)
299 #define tex1Dfetch_tex_jte_jc2(i) tex1Dfetch<float4>(tex_jte_jc2, i)
300 #define tex1Dfetch_tex_jte_cmp(i) tex1Dfetch<int>(tex_jte_cmp, i)
301 #define tex1Dfetch_tex_jte_cmt(i) tex1Dfetch<int>(tex_jte_cmt, i)
302 #define tex1Dfetch_tex_jte_jc3(i) tex1Dfetch<float4>(tex_jte_jc3, i)
303 #define tex1Dfetch_tex_jte_jc4(i) tex1Dfetch<float4>(tex_jte_jc4, i)
304 #define tex1Dfetch_tex_jte_jp(i) tex1Dfetch<float4>(tex_jte_jp, i)
305 #define tex1Dfetch_tex_jte_pmp(i) tex1Dfetch<int>(tex_jte_pmp, i)
306 #define tex1Dfetch_tex_jte_jp2(i) tex1Dfetch<float4>(tex_jte_jp2, i)
307 #define tex1Dfetch_tex_jtjd_cmp(i) tex1Dfetch<int>(tex_jtjd_cmp, i)
308 #define tex1Dfetch_tex_jtjd_cmlist(i) tex1Dfetch<int>(tex_jtjd_cmlist, i)
309 #define tex1Dfetch_tex_jtjd_jp(i) tex1Dfetch<float4>(tex_jtjd_jp, i)
310 #define tex1Dfetch_tex_jtjd_pmp(i) tex1Dfetch<int>(tex_jtjd_pmp, i)
311 #define tex1Dfetch_tex_jtjd_jp2(i) tex1Dfetch<float4>(tex_jtjd_jp2, i)
312 #define tex1Dfetch_tex_shuffle_jc(i) tex1Dfetch<float4>(tex_shuffle_jc, i)
313 #define tex1Dfetch_tex_shuffle_map(i) tex1Dfetch<int>(tex_shuffle_map, i)
314 #define tex1Dfetch_tex_shuffle_jc2(i) tex1Dfetch<float4>(tex_shuffle_jc2, i)
315 #define tex1Dfetch_tex_mjx_jc(i) tex1Dfetch<float4>(tex_mjx_jc, i)
316 #define tex1Dfetch_tex_mjx_jc2(i) tex1Dfetch<float4>(tex_mjx_jc2, i)
317 #define tex1Dfetch_tex_mjx_jc3(i) tex1Dfetch<float4>(tex_mjx_jc3, i)
318 #define tex1Dfetch_tex_mjx_jc4(i) tex1Dfetch<float4>(tex_mjx_jc4, i)
319 #define tex1Dfetch_tex_mjx_jp(i) tex1Dfetch<float4>(tex_mjx_jp, i)
320 #define tex1Dfetch_tex_mjx_jp2(i) tex1Dfetch<float4>(tex_mjx_jp2, i)
321 #define tex1Dfetch_tex_mjx_idx(i) tex1Dfetch<int2>(tex_mjx_idx, i)
322 #define tex1Dfetch_tex_mjx_x(i) tex1Dfetch<float4>(tex_mjx_x, i)
323 #define tex1Dfetch_tex_jte_q_idx(i) tex1Dfetch<int2>(tex_jte_q_idx, i)
324 #define tex1Dfetch_tex_jte_q_w(i) tex1Dfetch<float2>(tex_jte_q_w, i)
326 void ProgramCU::FinishWorkCUDA() { cudaDeviceSynchronize(); }
328 int ProgramCU::CheckErrorCUDA(const char* location) {
329 cudaError_t e = cudaGetLastError();
331 if (location) fprintf(stderr, "%s:\t", location);
332 fprintf(stderr, "%s(%d)\n", cudaGetErrorString(e), e);
335 // fprintf(stderr, "%s:\n", location);
340 inline void ProgramCU::GetBlockConfiguration(unsigned int nblock,
343 if (nblock <= MAX_BLOCKLEN) {
347 bh = (nblock + MAX_BLOCKLEN_ALIGN - 1) / MAX_BLOCKLEN_ALIGN;
348 bw = (nblock + bh - 1) / bh;
349 bw = ((bw + 31) / 32) * 32;
350 bh = (nblock + bw - 1) / bw;
354 void ProgramCU::ClearPreviousError() { cudaGetLastError(); }
356 void ProgramCU::ResetCurrentDevice() {
358 cudaGetDevice(&device);
360 if (device > 0) cudaSetDevice(device);
363 void ProgramCU::ClearTextureObjectCache() { PBA_ClearTextureObjectCache(); }
365 size_t ProgramCU::GetCudaMemoryCap() {
367 if (cudaGetDevice(&device) != cudaSuccess) return 0;
369 if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
370 if (prop.major == 9999 && prop.minor == 9999) return 0;
371 return prop.totalGlobalMem;
375 int ProgramCU::SetCudaDevice(int device) {
376 int count = 0, device_used;
377 if (cudaGetDeviceCount(&count) || count <= 0) {
378 ProgramCU::CheckErrorCUDA("CheckCudaDevice");
380 } else if (count == 1) {
381 cudaDeviceProp deviceProp;
382 if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess) {
383 fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
386 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
387 fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
392 if (device > 0 && device < count) {
393 cudaSetDevice(device);
394 CheckErrorCUDA("cudaSetDevice\n");
396 cudaGetDevice(&device_used);
397 if (device != device_used)
399 "ERROR: Cannot set device to %d\n"
400 "WARNING: Use device-%d instead (out of %d)\n",
401 device, device_used, count);
405 #define WARP_REDUCTION_32(value) \
407 if (threadIdx.x < 16) value[threadIdx.x] += value[threadIdx.x + 16]; \
408 if (threadIdx.x < 8) value[threadIdx.x] += value[threadIdx.x + 8]; \
409 if (threadIdx.x < 4) value[threadIdx.x] += value[threadIdx.x + 4]; \
410 if (threadIdx.x < 2) value[threadIdx.x] += value[threadIdx.x + 2];
412 #define WARP_REDUCTION_64(value) \
414 if (threadIdx.x < 32) value[threadIdx.x] += value[threadIdx.x + 32]; \
415 WARP_REDUCTION_32(value)
417 #define WARP_REDUCTION_128(value) \
419 if (threadIdx.x < 64) value[threadIdx.x] += value[threadIdx.x + 64]; \
420 WARP_REDUCTION_64(value)
422 #define WARP_REDUCTION_256(value) \
424 if (threadIdx.x < 128) value[threadIdx.x] += value[threadIdx.x + 128]; \
425 WARP_REDUCTION_128(value)
427 __global__ void vector_max_kernel(const float* x, int len, int blen,
429 __shared__ float value[256];
430 int bstart = blen * blockIdx.x;
431 int start = bstart + threadIdx.x;
432 int end = min(len, bstart + blen);
435 for (int i = start; i < end; i += blockDim.x) v = max(v, fabs(x[i]));
436 value[threadIdx.x] = v;
437 // reduce to the first two values
439 if (threadIdx.x < 128)
440 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 128]);
442 if (threadIdx.x < 64)
443 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 64]);
445 if (threadIdx.x < 32)
446 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 32]);
447 if (threadIdx.x < 16)
448 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 16]);
450 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 8]);
452 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 4]);
454 value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 2]);
456 if (threadIdx.x == 0) result[blockIdx.x] = max(value[0], value[1]);
459 float ProgramCU::ComputeVectorMax(CuTexImage& vector, CuTexImage& buf) {
460 const unsigned int nblock = 32;
461 const unsigned int bsize = 256;
462 int len = vector.GetLength();
463 int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
465 ////////////////////////////////
466 dim3 grid(nblock), block(bsize);
468 /////////////////////////////////
469 buf.InitTexture(nblock, 1);
470 vector_max_kernel<<<grid, block>>>(vector.data(), len, blen, buf.data());
471 ProgramCU::CheckErrorCUDA("ComputeVectorMax");
473 float data[nblock], result = 0;
474 buf.CopyToHost(data);
475 for (unsigned int i = 0; i < nblock; ++i) result = max(result, data[i]);
479 __global__ void vector_norm_kernel(const float* x, int len, int blen,
481 __shared__ float value[256];
482 int bstart = blen * blockIdx.x;
483 int start = bstart + threadIdx.x;
484 int end = min(len, bstart + blen);
487 for (int i = start; i < end; i += blockDim.x) {
491 value[threadIdx.x] = v;
492 // reduce to the first two values
493 WARP_REDUCTION_256(value);
496 if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
499 double ProgramCU::ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf) {
500 const unsigned int nblock = REDUCTION_NBLOCK;
501 unsigned int bsize = 256;
502 int len = vector.GetLength();
503 int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
505 ////////////////////////////////
506 dim3 grid(nblock), block(bsize);
508 /////////////////////////////////
509 buf.InitTexture(nblock, 1);
510 vector_norm_kernel<<<grid, block>>>(vector.data(), len, blen, buf.data());
511 ProgramCU::CheckErrorCUDA("ComputeVectorNorm");
514 buf.CopyToHost(data);
516 for (unsigned int i = 0; i < nblock; ++i) result += data[i];
520 __global__ void vector_sum_kernel(const float* x, int len, int blen,
522 __shared__ float value[256];
523 int bstart = blen * blockIdx.x;
524 int start = bstart + threadIdx.x;
525 int end = min(len, bstart + blen);
527 for (int i = start; i < end; i += blockDim.x) v += x[i];
529 value[threadIdx.x] = v;
530 // reduce to the first two values
531 WARP_REDUCTION_256(value);
534 if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
537 float ProgramCU::ComputeVectorSum(CuTexImage& vector, CuTexImage& buf,
539 const unsigned int nblock = REDUCTION_NBLOCK;
540 unsigned int bsize = 256;
541 int len = vector.GetLength() - skip;
542 int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
544 ////////////////////////////////
545 dim3 grid(nblock), block(bsize);
547 /////////////////////////////////
548 buf.InitTexture(nblock, 1);
549 vector_sum_kernel<<<grid, block>>>((vector.data()) + skip, len, blen,
551 ProgramCU::CheckErrorCUDA("ComputeVectorSum");
554 buf.CopyToHost(data);
556 for (unsigned int i = 0; i < nblock; ++i) result += data[i];
557 return (float)result;
560 __global__ void vector_dotproduct_kernel(const float* a, const float* b,
561 int len, int blen, float* result) {
562 __shared__ float value[256];
563 int bstart = blen * blockIdx.x;
564 int start = bstart + threadIdx.x;
565 int end = min(len, bstart + blen);
568 for (int i = start; i < end; i += blockDim.x) v += (a[i] * b[i]);
569 value[threadIdx.x] = v;
571 // reduce to the first two values
572 WARP_REDUCTION_256(value);
575 if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
578 double ProgramCU::ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2,
580 const unsigned int nblock = REDUCTION_NBLOCK;
581 unsigned int bsize = 256;
582 int len = vector1.GetLength();
583 int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
585 ////////////////////////////////
586 dim3 grid(nblock), block(bsize);
588 /////////////////////////////////
589 buf.InitTexture(nblock, 1);
590 vector_dotproduct_kernel<<<grid, block>>>(vector1.data(), vector2.data(), len,
592 ProgramCU::CheckErrorCUDA("ComputeVectorDot");
595 buf.CopyToHost(data);
598 for (unsigned int i = 0; i < nblock; ++i) result += data[i];
602 __global__ void vector_weighted_norm_kernel(const float* vec, const float* w,
603 int len, int blen, float* result) {
604 __shared__ float value[256];
605 int bstart = blen * blockIdx.x;
606 int start = bstart + threadIdx.x;
607 int end = min(len, bstart + blen);
610 for (int i = start; i < end; i += blockDim.x) v += (vec[i] * w[i] * vec[i]);
611 value[threadIdx.x] = v;
613 // reduce to the first two values
614 WARP_REDUCTION_256(value);
617 if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]);
620 double ProgramCU::ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight,
622 if (weight.IsValid()) {
623 const unsigned int nblock = REDUCTION_NBLOCK;
624 unsigned int bsize = 256;
625 int len = vector.GetLength();
626 int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize;
628 ////////////////////////////////
629 dim3 grid(nblock), block(bsize);
631 /////////////////////////////////
632 buf.InitTexture(nblock, 1);
634 vector_weighted_norm_kernel<<<grid, block>>>(vector.data(), weight.data(),
635 len, blen, buf.data());
637 ProgramCU::CheckErrorCUDA("ComputeVectorNormW");
640 buf.CopyToHost(data);
643 for (unsigned int i = 0; i < nblock; ++i) result += data[i];
646 return ComputeVectorNorm(vector, buf);
649 // given vector x, y, and a weight a
651 __global__ void saxpy_kernel(const float a, const float* x, const float* y,
652 float* result, unsigned int len) {
653 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
654 if (idx < len) result[idx] = a * x[idx] + y[idx];
657 __global__ void saxpy_kernel_large(const float a, const float* x,
658 const float* y, float* result,
659 unsigned int len, unsigned int rowsz) {
660 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
661 if (idx < len) result[idx] = a * x[idx] + y[idx];
664 void ProgramCU::ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY,
665 CuTexImage& result) {
666 unsigned int len = result.GetLength();
667 unsigned int bsize = 128;
668 unsigned int nblock = (len + bsize - 1) / bsize;
669 if (nblock > MAX_BLOCKLEN) {
671 GetBlockConfiguration(nblock, bw, bh);
672 dim3 grid(bw, bh), block(bsize);
673 saxpy_kernel_large<<<grid, block>>>(a, texX.data(), texY.data(),
674 result.data(), len, bw * bsize);
676 dim3 grid(nblock), block(bsize);
677 saxpy_kernel<<<grid, block>>>(a, texX.data(), texY.data(), result.data(),
680 ProgramCU::CheckErrorCUDA("ComputeSAXPY");
683 __global__ void sxypz_kernel_large(float a, const float* x, const float* y,
684 const float* z, float* result,
685 unsigned int len, unsigned int rowsz) {
686 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
687 if (idx < len) result[idx] = a * x[idx] * y[idx] + z[idx];
690 void ProgramCU::ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY,
691 CuTexImage& texZ, CuTexImage& result) {
692 if (texX.IsValid()) {
693 unsigned int len = texX.GetLength();
694 unsigned int bsize = 128;
695 unsigned int nblock = (len + bsize - 1) / bsize;
697 GetBlockConfiguration(nblock, bw, bh);
698 dim3 grid(bw, bh), block(bsize);
699 sxypz_kernel_large<<<grid, block>>>(a, texX.data(), texY.data(),
700 texZ.data(), result.data(), len,
703 ComputeSAXPY(a, texY, texZ, result);
707 __global__ void vxy_kernel(const float* x, float* y, float* result,
709 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
710 if (idx < len) result[idx] = x[idx] * y[idx];
713 __global__ void vxy_kernel_large(const float* x, float* y, float* result,
714 unsigned int len, unsigned int rowsz) {
715 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + rowsz * blockIdx.y;
716 if (idx < len) result[idx] = x[idx] * y[idx];
719 void ProgramCU::ComputeVXY(CuTexImage& texX, CuTexImage& texY,
720 CuTexImage& result, unsigned int part,
722 unsigned int len = part ? part : texX.GetLength();
723 unsigned int bsize = 128;
724 unsigned int nblock = (len + bsize - 1) / bsize;
725 if (nblock > MAX_BLOCKLEN) {
727 GetBlockConfiguration(nblock, bw, bh);
728 dim3 grid(bw, bh), block(bsize);
729 vxy_kernel_large<<<grid, block>>>(texX.data() + skip, texY.data() + skip,
730 result.data() + skip, len, bsize * bw);
732 dim3 grid(nblock), block(bsize);
733 vxy_kernel<<<grid, block>>>(texX.data() + skip, texY.data() + skip,
734 result.data() + skip, len);
736 ProgramCU::CheckErrorCUDA("ComputeVXY");
739 __global__ void sqrt_kernel_large(float* x, unsigned int len,
740 unsigned int rowsz) {
741 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
742 if (idx < len) x[idx] = sqrt(x[idx]);
745 void ProgramCU::ComputeSQRT(CuTexImage& tex) {
746 unsigned int len = tex.GetLength();
747 unsigned int bsize = 128;
748 unsigned int nblock = (len + bsize - 1) / bsize;
750 GetBlockConfiguration(nblock, bw, bh);
751 dim3 grid(bw, bh), block(bsize);
752 sqrt_kernel_large<<<grid, block>>>(tex.data(), len, bw * bsize);
753 ProgramCU::CheckErrorCUDA("ComputeSQRT");
756 __global__ void rsqrt_kernel_large(float* x, unsigned int len,
757 unsigned int rowsz) {
758 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
759 if (idx < len) x[idx] = x[idx] > 0 ? rsqrt(x[idx]) : 0;
762 void ProgramCU::ComputeRSQRT(CuTexImage& tex) {
763 unsigned int len = tex.GetLength();
764 unsigned int bsize = 128;
765 unsigned int nblock = (len + bsize - 1) / bsize;
767 GetBlockConfiguration(nblock, bw, bh);
768 dim3 grid(bw, bh), block(bsize);
769 rsqrt_kernel_large<<<grid, block>>>(tex.data(), len, bw * bsize);
771 ProgramCU::CheckErrorCUDA("ComputeRSQRT");
774 __global__ void sax_kernel(const float a, const float* x, float* result,
776 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
777 if (idx < len) result[idx] = a * x[idx];
780 __global__ void sax_kernel_large(const float a, const float* x, float* result,
781 unsigned int len, unsigned int rowsz) {
782 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
783 if (idx < len) result[idx] = a * x[idx];
786 void ProgramCU::ComputeSAX(float a, CuTexImage& texX, CuTexImage& result) {
787 unsigned int len = texX.GetLength();
788 unsigned int bsize = 128;
789 unsigned int nblock = (len + bsize - 1) / bsize;
791 if (nblock > MAX_BLOCKLEN) {
793 GetBlockConfiguration(nblock, bw, bh);
794 dim3 grid(bw, bh), block(bsize);
795 sax_kernel_large<<<grid, block>>>(a, texX.data(), result.data(), len,
798 dim3 grid(nblock), block(bsize);
799 sax_kernel<<<grid, block>>>(a, texX.data(), result.data(), len);
801 ProgramCU::CheckErrorCUDA("ComputeSAX");
804 #define JACOBIAN_FRT_KWIDTH 64
808 #ifndef PBA_DISABLE_CONST_CAMERA
809 #define JACOBIAN_SET_JC_BEGIN if (r3.w == 0.0f) {
810 #define JFRT_SET_JC_END \
813 jc[jc_pos] = make_float4(0, 0, 0, 0); \
814 jc[jc_pos + 1] = make_float4(0, 0, 0, 0); \
815 jc[jc_pos + 2] = make_float4(0, 0, 0, 0); \
816 jc[jc_pos + 3] = make_float4(0, 0, 0, 0); \
818 #define JACOBIAN_SET_JC_END \
839 #define JACOBIAN_SET_JC_BEGIN
840 #define JFRT_SET_JC_END
841 #define JACOBIAN_SET_JC_END
844 // projection model ei = K(RX + T) - (1 + r * m^2) * m
845 template <bool md, bool pd, bool scaling, bool shuffle>
846 __global__ void jacobian_frt_kernel(float4* jc, float4* jp, int nproj, int ptx,
847 int rowsz, float jic) {
848 ////////////////////////////////
849 int tidx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
851 if (tidx >= nproj) return;
852 int2 proj = tex1Dfetch(tex_jacobian_idx, tidx);
853 int camera_pos = proj.x << 1;
855 __shared__ float rr_data[JACOBIAN_FRT_KWIDTH * 9];
856 float* r = rr_data + IMUL(9, threadIdx.x);
857 float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
858 float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
863 float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
868 float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
871 float4 temp = tex1Dfetch(tex_jacobian_pts, proj.y);
877 float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
878 float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
879 float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
880 float f_p2 = FDIV(ft.x, z0 + ft.w);
881 float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
882 float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
884 // dp/dx = [f/p2 0 -f*p0/p2/p2]
885 // [0 f/p2 -f*p1/p2/p2]
889 // R(dw) (x y z)' = (0 -z y)' dw0 + (z 0 -x)'dw1 + (-y x 0)'dw2
892 jc_pos = tex1Dfetch(tex_jacobian_shuffle, tidx) << 2;
898 float rr1 = r3.y * p0_p2 * p0_p2;
899 float rr2 = r3.y * p1_p2 * p1_p2;
900 float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
901 float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
902 if (scaling == false) {
904 JACOBIAN_SET_JC_BEGIN
905 // float jic = (r3.w != 1.0f && r3.w != 2.0f) ? 1.0f : 0.0f;
906 // float jec = (r3.w != 1.0f && r3.w != 3.0f) ? 1.0f : 0.0f;
907 float jfc = jic * (1 + rr1 + rr2);
908 float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
909 jc[jc_pos] = make_float4(p0_p2 * jfc, f_p2_x, 0, -f_p2_x * p0_p2);
911 make_float4(-f_p2_x * p0_p2 * y0, f_p2_x * (z0 + x0 * p0_p2),
912 -f_p2_x * y0, ft_x_pn * p0_p2);
913 jc[jc_pos + 2] = make_float4(p1_p2 * jfc, 0, f_p2_y, -f_p2 * p1_p2);
915 make_float4(-f_p2_y * (z0 + y0 * p1_p2), f_p2_y * x0 * p1_p2,
916 f_p2_y * x0, ft_x_pn * p1_p2);
920 jp[(tidx << 1)] = make_float4(f_p2_x * (r[0] - r[6] * p0_p2),
921 f_p2_x * (r[1] - r[7] * p0_p2),
922 f_p2_x * (r[2] - r[8] * p0_p2), 0);
923 jp[(tidx << 1) + 1] = make_float4(f_p2_y * (r[3] - r[6] * p1_p2),
924 f_p2_y * (r[4] - r[7] * p1_p2),
925 f_p2_y * (r[5] - r[8] * p1_p2), 0);
929 JACOBIAN_SET_JC_BEGIN
930 float jfc = jic * (1 + rr1 + rr2);
931 float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
932 float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
933 jc[jc_pos] = make_float4(p0_p2 * jfc * sc1.x, f_p2_x * sc1.y, 0,
934 -f_p2_x * p0_p2 * sc1.w);
935 jc[jc_pos + 2] = make_float4(p1_p2 * jfc * sc1.x, 0, f_p2_y * sc1.z,
936 -f_p2_y * p1_p2 * sc1.w);
938 float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
939 jc[jc_pos + 1] = make_float4(
940 -sc2.x * f_p2_x * p0_p2 * y0, sc2.y * f_p2_x * (z0 + x0 * p0_p2),
941 -sc2.z * f_p2_x * y0, ft_x_pn * p0_p2 * sc2.w);
942 jc[jc_pos + 3] = make_float4(
943 -sc2.x * f_p2_y * (z0 + y0 * p1_p2), sc2.y * f_p2_y * x0 * p1_p2,
944 sc2.z * f_p2_y * x0, ft_x_pn * p1_p2 * sc2.w);
948 float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
949 jp[(tidx << 1)] = make_float4(sc3.x * f_p2_x * (r[0] - r[6] * p0_p2),
950 sc3.y * f_p2_x * (r[1] - r[7] * p0_p2),
951 sc3.z * f_p2_x * (r[2] - r[8] * p0_p2), 0);
952 jp[(tidx << 1) + 1] =
953 make_float4(sc3.x * f_p2_y * (r[3] - r[6] * p1_p2),
954 sc3.y * f_p2_y * (r[4] - r[7] * p1_p2),
955 sc3.z * f_p2_y * (r[5] - r[8] * p1_p2), 0);
958 if (scaling == false) {
960 JACOBIAN_SET_JC_BEGIN
961 float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
962 float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
963 jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2);
965 make_float4(-f_p2 * p0_p2 * y0, f_p2 * (z0 + x0 * p0_p2),
966 -f_p2 * y0, -ms.x * msn);
967 jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2);
968 jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2),
969 f_p2 * x0 * p1_p2, f_p2 * x0, -ms.y * msn);
973 jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2),
974 f_p2 * (r[1] - r[7] * p0_p2),
975 f_p2 * (r[2] - r[8] * p0_p2), 0);
976 jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2),
977 f_p2 * (r[4] - r[7] * p1_p2),
978 f_p2 * (r[5] - r[8] * p1_p2), 0);
981 JACOBIAN_SET_JC_BEGIN
982 float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
983 jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0,
984 -f_p2 * p0_p2 * sc1.w);
985 jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z,
986 -f_p2 * p1_p2 * sc1.w);
988 float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
989 float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
990 float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
991 jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0,
992 sc2.y * f_p2 * (z0 + x0 * p0_p2),
993 -sc2.z * f_p2 * y0, -msn * ms.x * sc2.w);
994 jc[jc_pos + 3] = make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2),
995 sc2.y * f_p2 * x0 * p1_p2,
996 sc2.z * f_p2 * x0, -msn * ms.y * sc2.w);
999 float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
1000 jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2),
1001 sc3.y * f_p2 * (r[1] - r[7] * p0_p2),
1002 sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0);
1003 jp[(tidx << 1) + 1] =
1004 make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2),
1005 sc3.y * f_p2 * (r[4] - r[7] * p1_p2),
1006 sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0);
1010 if (scaling == false) {
1012 JACOBIAN_SET_JC_BEGIN
1013 jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2);
1014 jc[jc_pos + 1] = make_float4(-f_p2 * p0_p2 * y0,
1015 f_p2 * (z0 + x0 * p0_p2), -f_p2 * y0, 0);
1016 jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2);
1017 jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2),
1018 f_p2 * x0 * p1_p2, f_p2 * x0, 0);
1021 ////////////////////
1022 jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2),
1023 f_p2 * (r[1] - r[7] * p0_p2),
1024 f_p2 * (r[2] - r[8] * p0_p2), 0);
1025 jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2),
1026 f_p2 * (r[4] - r[7] * p1_p2),
1027 f_p2 * (r[5] - r[8] * p1_p2), 0);
1030 JACOBIAN_SET_JC_BEGIN
1031 float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x);
1032 jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0,
1033 -f_p2 * p0_p2 * sc1.w);
1034 jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z,
1035 -f_p2 * p1_p2 * sc1.w);
1036 float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1);
1037 jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0,
1038 sc2.y * f_p2 * (z0 + x0 * p0_p2),
1039 -sc2.z * f_p2 * y0, 0);
1041 make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2),
1042 sc2.y * f_p2 * x0 * p1_p2, sc2.z * f_p2 * x0, 0);
1046 float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx);
1047 jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2),
1048 sc3.y * f_p2 * (r[1] - r[7] * p0_p2),
1049 sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0);
1050 jp[(tidx << 1) + 1] =
1051 make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2),
1052 sc3.y * f_p2 * (r[4] - r[7] * p1_p2),
1053 sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0);
1058 /////////////////////////////////
1059 void ProgramCU::ComputeJacobian(CuTexImage& camera, CuTexImage& point,
1060 CuTexImage& jc, CuTexImage& jp,
1061 CuTexImage& proj_map, CuTexImage& sj,
1062 CuTexImage& meas, CuTexImage& cmlist,
1063 bool intrinsic_fixed, int radial_distortion,
1065 float jfc = intrinsic_fixed ? 0.0f : 1.0f;
1066 unsigned int len = proj_map.GetImgWidth();
1067 unsigned int bsize = JACOBIAN_FRT_KWIDTH;
1068 unsigned int nblock = (len + bsize - 1) / bsize;
1069 unsigned int bw, bh;
1070 GetBlockConfiguration(nblock, bw, bh);
1071 dim3 grid(bw, bh), block(bsize);
1073 PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1074 PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1075 PBA_BIND_TEX1D(tex_jacobian_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1077 if (!jc.IsValid()) shuffle = false;
1078 if (shuffle) PBA_BIND_TEX1D(tex_jacobian_shuffle, cmlist, cudaReadModeElementType, PBA_ChanInt());
1079 if (sj.IsValid()) PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
1081 if (radial_distortion == -1) {
1082 PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
1085 jacobian_frt_kernel<true, false, true, true><<<grid, block>>>(
1086 (float4*)jc.data(), (float4*)jp.data(), len,
1087 camera.GetImgWidth() * 2, bw * bsize, jfc);
1089 jacobian_frt_kernel<true, false, true, false><<<grid, block>>>(
1090 (float4*)jc.data(), (float4*)jp.data(), len,
1091 camera.GetImgWidth() * 2, bw * bsize, jfc);
1094 jacobian_frt_kernel<true, false, false, true><<<grid, block>>>(
1095 (float4*)jc.data(), (float4*)jp.data(), len,
1096 camera.GetImgWidth() * 2, bw * bsize, jfc);
1098 jacobian_frt_kernel<true, false, false, false><<<grid, block>>>(
1099 (float4*)jc.data(), (float4*)jp.data(), len,
1100 camera.GetImgWidth() * 2, bw * bsize, jfc);
1102 } else if (radial_distortion) {
1105 jacobian_frt_kernel<false, true, true, true><<<grid, block>>>(
1106 (float4*)jc.data(), (float4*)jp.data(), len,
1107 camera.GetImgWidth() * 2, bw * bsize, jfc);
1109 jacobian_frt_kernel<false, true, true, false><<<grid, block>>>(
1110 (float4*)jc.data(), (float4*)jp.data(), len,
1111 camera.GetImgWidth() * 2, bw * bsize, jfc);
1114 jacobian_frt_kernel<false, true, false, true><<<grid, block>>>(
1115 (float4*)jc.data(), (float4*)jp.data(), len,
1116 camera.GetImgWidth() * 2, bw * bsize, jfc);
1118 jacobian_frt_kernel<false, true, false, false><<<grid, block>>>(
1119 (float4*)jc.data(), (float4*)jp.data(), len,
1120 camera.GetImgWidth() * 2, bw * bsize, jfc);
1125 jacobian_frt_kernel<false, false, true, true><<<grid, block>>>(
1126 (float4*)jc.data(), (float4*)jp.data(), len,
1127 camera.GetImgWidth() * 2, bw * bsize, jfc);
1129 jacobian_frt_kernel<false, false, true, false><<<grid, block>>>(
1130 (float4*)jc.data(), (float4*)jp.data(), len,
1131 camera.GetImgWidth() * 2, bw * bsize, jfc);
1134 jacobian_frt_kernel<false, false, false, true><<<grid, block>>>(
1135 (float4*)jc.data(), (float4*)jp.data(), len,
1136 camera.GetImgWidth() * 2, bw * bsize, jfc);
1138 jacobian_frt_kernel<false, false, false, false><<<grid, block>>>(
1139 (float4*)jc.data(), (float4*)jp.data(), len,
1140 camera.GetImgWidth() * 2, bw * bsize, jfc);
1144 ProgramCU::CheckErrorCUDA("ComputeJacobian");
1148 __global__ void uncompress_frt_kernel(int ncam, float4* ucam) {
1149 int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1150 if (tidx >= ncam) return;
1151 int fetch_index = tidx << 1;
1152 int write_index = IMUL(tidx, 4);
1153 float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index);
1154 ucam[write_index] = temp1;
1156 float4 temp2 = tex1Dfetch(tex_compact_cam, fetch_index + 1);
1160 float rx_rx = rx * rx;
1161 float ry_ry = ry * ry;
1162 float rz_rz = rz * rz;
1163 float aa = sqrt(rx_rx + ry_ry + rz_rz);
1165 sincosf(aa, &saa, &caa);
1166 float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa);
1167 float st = aa == 0.0 ? 1 : FDIV2(saa, aa);
1168 float rz_st = rz * st;
1169 float rx_st = rx * st;
1170 float ry_st = ry * st;
1171 float ry_ry_ct = ry_ry * ct;
1172 float rx_rx_ct = rx_rx * ct;
1173 float rz_rz_ct = rz_rz * ct;
1174 float rx_ry_ct = rx * ry * ct;
1175 float rz_rx_ct = rz * rx * ct;
1176 float ry_rz_ct = ry * rz * ct;
1178 ////////////////////////////////////////////////////////////
1179 ucam[write_index + 1] =
1180 make_float4((1.0 - (ry_ry_ct + rz_rz_ct)), (rx_ry_ct - rz_st),
1181 (rz_rx_ct + ry_st), (rx_ry_ct + rz_st));
1183 ucam[write_index + 2] =
1184 make_float4((1.0 - (rz_rz_ct + rx_rx_ct)), (ry_rz_ct - rx_st),
1185 (rz_rx_ct - ry_st), (ry_rz_ct + rx_st));
1187 ucam[write_index + 3] =
1188 make_float4((1.0 - (rx_rx_ct + ry_ry_ct)), temp2.w, 0, 0);
1191 void ProgramCU::UncompressCamera(int ncam, CuTexImage& camera,
1192 CuTexImage& result) {
1193 unsigned int len = ncam;
1194 unsigned int bsize = 64;
1195 unsigned int nblock = (len + bsize - 1) / bsize;
1198 PBA_BIND_TEX1D(tex_compact_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1199 uncompress_frt_kernel<<<grid, block>>>(len, (float4*)result.data());
1200 CheckErrorCUDA("UncompressCamera");
1205 __global__ void compress_frt_kernel(int ncam, float4* zcam) {
1206 int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1207 if (tidx >= ncam) return;
1208 int fetch_index = tidx << 2;
1209 int write_index = tidx << 1;
1210 float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index);
1211 zcam[write_index] = temp1;
1213 float4 r1 = tex1Dfetch(tex_compact_cam, fetch_index + 1);
1214 float4 r2 = tex1Dfetch(tex_compact_cam, fetch_index + 2);
1215 float4 r3 = tex1Dfetch(tex_compact_cam, fetch_index + 3);
1217 float a = (r1.x + r2.x + r3.x - 1.0) / 2.0;
1219 zcam[write_index + 1] = make_float4(0, 0, 0, 0);
1221 float aa = acos(a), b = 0.5 * aa * rsqrt(1 - a * a);
1222 zcam[write_index + 1] = make_float4(b * (r2.w - r2.y), b * (r1.z - r2.z),
1223 b * (r1.w - r1.y), r3.y);
1227 void ProgramCU::CompressCamera(int ncam, CuTexImage& camera0,
1228 CuTexImage& result) {
1229 unsigned int len = ncam;
1230 unsigned int bsize = 64;
1231 unsigned int nblock = (len + bsize - 1) / bsize;
1232 dim3 grid(nblock), block(bsize);
1233 PBA_BIND_TEX1D(tex_uncompressed_cam, camera0, cudaReadModeElementType, PBA_ChanFloat4());
1234 compress_frt_kernel<<<grid, block>>>(ncam, (float4*)result.data());
1235 CheckErrorCUDA("CompressCamera");
1238 __device__ inline void uncompress_rodrigues_rotation(float rx, float ry,
1239 float rz, float* r) {
1240 float rx_rx = rx * rx;
1241 float ry_ry = ry * ry;
1242 float rz_rz = rz * rz;
1243 float aa = sqrt(rx_rx + ry_ry + rz_rz);
1245 sincosf(aa, &saa, &caa);
1246 float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa);
1247 float st = aa == 0.0 ? 1 : FDIV2(saa, aa);
1248 float rz_st = rz * st;
1249 float rx_st = rx * st;
1250 float ry_st = ry * st;
1251 float ry_ry_ct = ry_ry * ct;
1252 float rx_rx_ct = rx_rx * ct;
1253 float rz_rz_ct = rz_rz * ct;
1254 float rx_ry_ct = rx * ry * ct;
1255 float rz_rx_ct = rz * rx * ct;
1256 float ry_rz_ct = ry * rz * ct;
1257 r[0] = (1.0 - (ry_ry_ct + rz_rz_ct));
1258 r[1] = (rx_ry_ct - rz_st);
1259 r[2] = (rz_rx_ct + ry_st);
1260 r[3] = (rx_ry_ct + rz_st);
1261 r[4] = (1.0 - (rz_rz_ct + rx_rx_ct));
1262 r[5] = (ry_rz_ct - rx_st);
1263 r[6] = (rz_rx_ct - ry_st);
1264 r[7] = (ry_rz_ct + rx_st);
1265 r[8] = (1.0 - (rx_rx_ct + ry_ry_ct));
1270 __global__ void update_camera_kernel(int ncam, float4* newcam) {
1271 int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1272 if (tidx >= ncam) return;
1273 int index0 = tidx << 2;
1274 int index1 = tidx << 1;
1276 float4 c1 = tex1Dfetch(tex_update_cam, index0);
1277 float4 d1 = tex1Dfetch(tex_update_cam_delta, index1);
1278 float4 c2 = make_float4(max(c1.x + d1.x, 1e-10f), c1.y + d1.y, c1.z + d1.z,
1280 newcam[index0] = c2;
1283 float r[9], dr[9]; //, nr[9];
1284 float4 r1 = tex1Dfetch(tex_update_cam, index0 + 1);
1289 float4 r2 = tex1Dfetch(tex_update_cam, index0 + 2);
1294 float4 r3 = tex1Dfetch(tex_update_cam, index0 + 3);
1297 float4 dd = tex1Dfetch(tex_update_cam_delta, index1 + 1);
1298 uncompress_rodrigues_rotation(dd.x, dd.y, dd.z, dr);
1300 ///////////////////////////////////////////////
1301 newcam[index0 + 1] =
1302 make_float4(dr[0] * r[0] + dr[1] * r[3] + dr[2] * r[6],
1303 dr[0] * r[1] + dr[1] * r[4] + dr[2] * r[7],
1304 dr[0] * r[2] + dr[1] * r[5] + dr[2] * r[8],
1305 dr[3] * r[0] + dr[4] * r[3] + dr[5] * r[6]);
1306 newcam[index0 + 2] =
1307 make_float4(dr[3] * r[1] + dr[4] * r[4] + dr[5] * r[7],
1308 dr[3] * r[2] + dr[4] * r[5] + dr[5] * r[8],
1309 dr[6] * r[0] + dr[7] * r[3] + dr[8] * r[6],
1310 dr[6] * r[1] + dr[7] * r[4] + dr[8] * r[7]);
1311 newcam[index0 + 3] = make_float4(dr[6] * r[2] + dr[7] * r[5] + dr[8] * r[8],
1312 r3.y + dd.w, r3.z, r3.w);
1316 void ProgramCU::UpdateCameraPoint(int ncam, CuTexImage& camera,
1317 CuTexImage& point, CuTexImage& delta,
1318 CuTexImage& new_camera, CuTexImage& new_point,
1321 unsigned int len = ncam;
1322 unsigned int bsize = 64;
1323 unsigned int nblock = (len + bsize - 1) / bsize;
1324 dim3 grid(nblock), block(bsize);
1325 PBA_BIND_TEX1D(tex_update_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1326 PBA_BIND_TEX1D(tex_update_cam_delta, delta, cudaReadModeElementType, PBA_ChanFloat4());
1327 update_camera_kernel<<<grid, block>>>(len, (float4*)new_camera.data());
1328 CheckErrorCUDA("UpdateCamera");
1331 // update the points
1334 dp.SetTexture(delta.data() + 8 * ncam, point.GetLength());
1335 ComputeSAXPY(1.0f, dp, point, new_point);
1336 CheckErrorCUDA("UpdatePoint");
1340 #define PROJECTION_FRT_KWIDTH 64
1344 // run 32/64/128 projections in a block
1345 template <bool md, bool pd>
1346 __global__ void projection_frt_kernel(int nproj, int rowsz, float2* pj) {
1347 ////////////////////////////////
1348 int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
1349 if (tidx >= nproj) return;
1350 float f, m[3], t[3]; // r[9],
1351 __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9];
1352 float* r = rr_data + IMUL(9, threadIdx.x);
1353 int2 proj = tex1Dfetch(tex_projection_idx, tidx);
1354 int cpos = proj.x << 1;
1355 float4 ft = tex1Dfetch(tex_projection_cam, cpos);
1360 float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1);
1365 float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2);
1370 float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3);
1373 float4 temp = tex1Dfetch(tex_projection_pts, proj.y);
1378 float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0];
1379 float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1];
1380 float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2];
1383 float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2);
1384 float f_p2 = FDIV2(f * rr, p2);
1385 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1386 pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1388 float f_p2 = FDIV2(f, p2);
1389 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1390 float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y);
1391 pj[tidx] = make_float2(ms.x * rd - p0 * f_p2, ms.y * rd - p1 * f_p2);
1393 float f_p2 = FDIV2(f, p2);
1394 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1395 pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1399 void ProgramCU::ComputeProjection(CuTexImage& camera, CuTexImage& point,
1400 CuTexImage& meas, CuTexImage& proj_map,
1401 CuTexImage& proj, int radial) {
1402 unsigned int len = proj_map.GetImgWidth();
1403 unsigned int bsize = PROJECTION_FRT_KWIDTH;
1404 unsigned int nblock = (len + bsize - 1) / bsize;
1405 PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1406 PBA_BIND_TEX1D(tex_projection_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1407 PBA_BIND_TEX1D(tex_projection_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1408 unsigned int bw, bh;
1409 GetBlockConfiguration(nblock, bw, bh);
1410 dim3 grid(bw, bh), block(bsize);
1411 PBA_BIND_TEX1D(tex_projection_mea, meas, cudaReadModeElementType, PBA_ChanFloat2());
1413 projection_frt_kernel<true, false><<<grid, block>>>(len, bw * bsize,
1414 (float2*)proj.data());
1416 projection_frt_kernel<false, true><<<grid, block>>>(len, bw * bsize,
1417 (float2*)proj.data());
1419 projection_frt_kernel<false, false><<<grid, block>>>(len, bw * bsize,
1420 (float2*)proj.data());
1421 CheckErrorCUDA("ComputeProjection");
1424 template <bool md, bool pd>
1425 __global__ void projectionx_frt_kernel(int nproj, int rowsz, float2* pj) {
1426 ////////////////////////////////
1427 int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
1428 if (tidx >= nproj) return;
1429 float f, m[3], t[3]; // r[9],
1430 __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9];
1431 float* r = rr_data + IMUL(9, threadIdx.x);
1432 int2 proj = tex1Dfetch(tex_projection_idx, tidx);
1433 int cpos = proj.x << 1;
1434 float4 ft = tex1Dfetch(tex_projection_cam, cpos);
1439 float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1);
1444 float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2);
1449 float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3);
1452 float4 temp = tex1Dfetch(tex_projection_pts, proj.y);
1457 float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0];
1458 float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1];
1459 float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2];
1461 float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2);
1462 float f_p2 = FDIV2(f, p2);
1463 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1464 pj[tidx] = make_float2(ms.x / rr - p0 * f_p2, ms.y / rr - p1 * f_p2);
1466 float f_p2 = FDIV2(f, p2);
1467 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1468 float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y);
1469 pj[tidx] = make_float2(ms.x - p0 * f_p2 / rd, ms.y - p1 * f_p2 / rd);
1471 float f_p2 = FDIV2(f, p2);
1472 float2 ms = tex1Dfetch(tex_projection_mea, tidx);
1473 pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2);
1477 void ProgramCU::ComputeProjectionX(CuTexImage& camera, CuTexImage& point,
1478 CuTexImage& meas, CuTexImage& proj_map,
1479 CuTexImage& proj, int radial) {
1480 unsigned int len = proj_map.GetImgWidth();
1481 unsigned int bsize = PROJECTION_FRT_KWIDTH;
1482 unsigned int nblock = (len + bsize - 1) / bsize;
1483 PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
1484 PBA_BIND_TEX1D(tex_projection_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
1485 PBA_BIND_TEX1D(tex_projection_idx, proj_map, cudaReadModeElementType, PBA_ChanInt2());
1486 unsigned int bw, bh;
1487 GetBlockConfiguration(nblock, bw, bh);
1488 dim3 grid(bw, bh), block(bsize);
1489 PBA_BIND_TEX1D(tex_projection_mea, meas, cudaReadModeElementType, PBA_ChanFloat2());
1491 projectionx_frt_kernel<true, false><<<grid, block>>>(len, bw * bsize,
1492 (float2*)proj.data());
1494 projectionx_frt_kernel<false, true><<<grid, block>>>(len, bw * bsize,
1495 (float2*)proj.data());
1497 projectionx_frt_kernel<false, false><<<grid, block>>>(len, bw * bsize,
1498 (float2*)proj.data());
1499 CheckErrorCUDA("ComputeProjection");
1504 __global__ void jte_cam_kernel(int num, float* jc, float* jte) {
1505 __shared__ float value[128];
1507 // 8thread per camera
1508 int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1509 if (col >= num) return;
1511 int cam = col >> 4; // 8 thread per camera
1513 // read data range for this camera, 8 thread will do the same thing
1514 int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera
1515 int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1
1517 ///////////////////////////////
1518 int offset = threadIdx.x & 0xf; // which parameter of this camera
1519 int part = offset >= 8 ? 1 : 0;
1520 /////////////////////////////
1523 // loop to read the index of the projection.
1524 // so to get the location to read the jacobian
1525 for (int i = idx1 + offset; i < idx2; i += 16) {
1527 // every 8 thread will read the same position.
1528 int index = tex1Dfetch(tex_jte_cmt, i >> 4);
1529 float v = tex1Dfetch(tex_jte_pex, (index << 1) + part);
1530 //////////////////////
1533 value[threadIdx.x] = result;
1535 if (offset < 8) jte[(cam << 3) + offset] = (result + value[threadIdx.x + 8]);
1538 template <int KH, int TEXN>
1539 __global__ void jte_cam_vec_kernel(int num, float* jte) {
1540 __shared__ float value[KH * 128];
1541 int cam = blockIdx.x * KH + threadIdx.y;
1542 if (cam >= num) return;
1544 // read data range for this camera
1545 // 8 thread will do the same thing
1546 int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 2; // first camera
1547 int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 2; // last camera + 1
1548 int part = (threadIdx.x & 0x02) ? 1 : 0;
1550 float rx = 0, ry = 0, rz = 0, rw = 0;
1551 // loop to read the index of the projection.
1552 // so to get the location to read the jacobian
1553 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1556 temp = tex1Dfetch(tex_jte_jc, i);
1559 int texid = i >> 25;
1561 temp = tex1Dfetch(tex_jte_jc, i);
1563 temp = tex1Dfetch(tex_jte_jc2, (i & 0x1ffffff));
1566 int index = tex1Dfetch(tex_jte_cmt, i >> 2);
1567 int iii = (index << 2) + (i & 0x3);
1568 int texid = iii >> 25;
1569 /////////////////////////////////
1571 temp = tex1Dfetch(tex_jte_jc, iii);
1572 else if (texid == 1)
1573 temp = tex1Dfetch(tex_jte_jc2, (iii & 0x1ffffff));
1574 else if (texid == 2)
1575 temp = tex1Dfetch(tex_jte_jc3, (iii & 0x1ffffff));
1577 temp = tex1Dfetch(tex_jte_jc4, (iii & 0x1ffffff));
1579 int index = tex1Dfetch(tex_jte_cmt, i >> 2);
1580 float vv = tex1Dfetch(tex_jte_pex, (index << 1) + part);
1586 ////////////////////////////////////
1587 int widx = (threadIdx.y << 7) + (threadIdx.x << 2);
1588 ///////////////////////////////////
1591 value[widx + 1] = ry;
1592 value[widx + 2] = rz;
1593 value[widx + 3] = rw;
1594 ////////////////////////////////////
1595 int ridx = (threadIdx.y << 7) + threadIdx.x;
1596 value[ridx] = ((value[ridx] + value[ridx + 32]) +
1597 (value[ridx + 64] + value[ridx + 96]));
1598 if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
1599 if (threadIdx.x < 8)
1600 jte[(cam << 3) + threadIdx.x] = value[ridx] + value[ridx + 8];
1603 template <int KH, bool JT>
1604 __global__ void jte_cam_vec32_kernel(int num, float* jc, float* jte) {
1605 __shared__ float value[KH * 32];
1606 int cam = blockIdx.x * KH + threadIdx.y;
1607 if (cam >= num) return;
1609 int rowpos = (threadIdx.y << 5);
1610 int index = threadIdx.x + rowpos;
1611 int xypart = (threadIdx.x & 0x08) ? 1 : 0;
1612 int part2 = threadIdx.x & 0xf;
1613 // read data range for this camera
1614 // 8 thread will do the same thing
1615 int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera
1616 int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1
1618 // loop to read the index of the projection.
1619 // so to get the location to read the jacobian
1620 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1621 int index = tex1Dfetch(tex_jte_cmt, i >> 4);
1626 temp = jc[(index << 4) + part2];
1628 float v = tex1Dfetch(tex_jte_pex, (index << 1) + xypart);
1633 if (threadIdx.x < 16) value[index] += value[index + 16];
1634 if (threadIdx.x < 8)
1635 jte[(cam << 3) + threadIdx.x] = value[index] + value[index + 8];
1638 /////////////////////////////////////////////////////////////
1641 __global__ void jte_point_kernel(int num, float4* jte) {
1642 ////////////////////////////
1643 int index = blockIdx.x * blockDim.x + threadIdx.x;
1644 if (index >= num) return;
1646 int idx1 = tex1Dfetch(tex_jte_pmp, index); // first camera
1647 int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last camera + 1
1648 float4 result = make_float4(0, 0, 0, 0);
1649 for (int i = idx1; i < idx2; ++i) {
1651 float2 ev = tex1Dfetch(tex_jte_pe, i);
1653 float4 j1 = tex1Dfetch(tex_jte_jp, i << 1);
1654 result.x += j1.x * ev.x;
1655 result.y += j1.y * ev.x;
1656 result.z += j1.z * ev.x;
1658 float4 j2 = tex1Dfetch(tex_jte_jp, 1 + (i << 1));
1659 result.x += j2.x * ev.y;
1660 result.y += j2.y * ev.y;
1661 result.z += j2.z * ev.y;
1663 jte[index] = result;
1666 ////////////////////
1667 // faster but not always more accurate
1668 //#define JTE_POINT_VEC2
1670 template <int KH, int TEXN>
1671 __global__ void jte_point_vec_kernel(int num, int rowsz, float* jte) {
1672 ////////////////////////////
1673 __shared__ float value[KH * 128];
1674 int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
1675 if (index >= num) return;
1676 #ifdef JTE_POINT_VEC2
1677 int idx1 = tex1Dfetch(tex_jte_pmp, index); // first
1678 int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1
1680 int idx1 = tex1Dfetch(tex_jte_pmp, index) << 1; // first
1681 int idx2 = tex1Dfetch(tex_jte_pmp, index + 1) << 1; // last + 1
1683 float rx = 0, ry = 0, rz = 0;
1684 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1685 if (TEXN == 2 && i >> 25) {
1686 #ifdef JTE_POINT_VEC2
1688 float2 vv = tex1Dfetch(tex_jte_pe, i);
1689 float4 jp1 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1));
1690 float4 jp2 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1) + 1);
1691 rx += (jp1.x * vv.x + jp2.x * vv.y);
1692 ry += (jp1.y * vv.x + jp2.y * vv.y);
1693 rz += (jp1.z * vv.x + jp2.z * vv.y);
1695 float vv = tex1Dfetch(tex_jte_pex, i);
1696 float4 jpi = tex1Dfetch(tex_jte_jp2, i & 0x1ffffff);
1702 #ifdef JTE_POINT_VEC2
1703 float2 vv = tex1Dfetch(tex_jte_pe, i);
1704 float4 jp1 = tex1Dfetch(tex_jte_jp, (i << 1));
1705 float4 jp2 = tex1Dfetch(tex_jte_jp, (i << 1) + 1);
1706 rx += (jp1.x * vv.x + jp2.x * vv.y);
1707 ry += (jp1.y * vv.x + jp2.y * vv.y);
1708 rz += (jp1.z * vv.x + jp2.z * vv.y);
1710 float vv = tex1Dfetch(tex_jte_pex, i);
1711 float4 jpi = tex1Dfetch(tex_jte_jp, i);
1719 int rowp = threadIdx.y << 7;
1720 int loc = (threadIdx.x << 2) + rowp;
1722 value[loc + 1] = ry;
1723 value[loc + 2] = rz;
1726 int ridx = threadIdx.x + rowp;
1727 value[ridx] = ((value[ridx] + value[ridx + 32]) +
1728 (value[ridx + 64] + value[ridx + 96]));
1729 if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
1730 if (threadIdx.x < 8) value[ridx] += value[ridx + 8];
1731 if (threadIdx.x < 4)
1732 jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4];
1735 #define JTE_CAMERA_VEC
1736 #define JTE_POINT_VEC
1738 void ProgramCU::ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap,
1739 CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap,
1740 CuTexImage& jte, bool jc_transpose, int mode) {
1741 //////////////////////////////////////////////////////////
1742 int ncam = int(cmap.GetImgWidth() - 1); // how many cameras
1743 size_t szjc = jc.GetDataSize();
1745 //////////////////////////////
1746 PBA_BIND_TEX1D(tex_jte_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
1747 PBA_BIND_TEX1D(tex_jte_cmt, cmlist, cudaReadModeElementType, PBA_ChanInt());
1748 #ifdef JTE_CAMERA_VEC2
1749 PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1750 const unsigned int bheight = 2;
1751 dim3 block1(32, bheight), grid1((ncam + bheight - 1) / bheight);
1753 } else if (jc_transpose)
1754 jte_cam_vec32_kernel<bheight, true><<<grid1, block1>>>(ncam, jc.data(),
1757 jte_cam_vec32_kernel<bheight, false><<<grid1, block1>>>(ncam, jc.data(),
1760 #elif defined(JTE_CAMERA_VEC)
1761 PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1762 const unsigned int bheight = 2;
1763 unsigned int len1 = ncam * 32;
1764 unsigned int bsize1 = 32 * bheight;
1765 unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
1766 dim3 grid1(nblock1);
1767 dim3 block1(32, bheight);
1770 } else if (szjc > 2 * MAX_TEXSIZE || !jc_transpose) {
1772 jte_cam_vec32_kernel<bheight, true><<<grid1, block1>>>(ncam, jc.data(),
1775 jte_cam_vec32_kernel<bheight, false><<<grid1, block1>>>(ncam, jc.data(),
1777 } else if (szjc > MAX_TEXSIZE) {
1778 PBA_BIND_TEX1D_2(tex_jte_jc, tex_jte_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
1779 jte_cam_vec_kernel<bheight, 2><<<grid1, block1>>>(ncam, jte.data());
1781 PBA_BIND_TEX1D(tex_jte_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
1782 jte_cam_vec_kernel<bheight, 1><<<grid1, block1>>>(ncam, jte.data());
1785 PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1786 unsigned int len1 = ncam * 16;
1787 unsigned int bsize1 = len1 > 32 * 128 ? 128 : (len1 > 32 * 64 ? 64 : 32);
1788 unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
1789 dim3 grid1(nblock1), block1(bsize1);
1790 jte_cam_kernel<<<grid1, block1>>>(len1, jc.data(), jte.data());
1792 CheckErrorCUDA("ComputeJtE<Camera>");
1794 ////////////////////////////////////////////
1795 PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
1796 unsigned int npoint = (pmap.GetImgWidth() - 1);
1797 #ifndef JTE_POINT_VEC
1798 size_t len2 = npoint;
1799 unsigned int bsize2 = 64;
1800 unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
1801 dim3 grid2(nblock2), block2(bsize2);
1802 PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
1803 PBA_BIND_TEX1D(tex_jte_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1804 jte_point_kernel<<<grid2, block2>>>(len2, ((float4*)jte.data()) + 2 * ncam);
1807 #ifdef JTE_POINT_VEC2
1808 PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
1810 PBA_BIND_TEX1D(tex_jte_pex, pe, cudaReadModeElementType, PBA_ChanFloat());
1812 const unsigned int bheight2 = 2;
1813 unsigned int bsize2 = 32;
1814 unsigned int nblock2 = (unsigned int)((npoint + bheight2 - 1) / bheight2);
1815 unsigned int offsetv = 8 * ncam;
1816 unsigned int bw, bh;
1817 GetBlockConfiguration(nblock2, bw, bh);
1818 dim3 grid2(bw, bh), block2(bsize2, bheight2);
1821 } else if (jp.GetDataSize() > MAX_TEXSIZE) {
1822 PBA_BIND_TEX1D_2(tex_jte_jp, tex_jte_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
1823 jte_point_vec_kernel<bheight2, 2><<<grid2, block2>>>(
1824 npoint, bw * bheight2, ((float*)jte.data()) + offsetv);
1826 PBA_BIND_TEX1D(tex_jte_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1827 jte_point_vec_kernel<bheight2, 1><<<grid2, block2>>>(
1828 npoint, bw * bheight2, ((float*)jte.data()) + offsetv);
1831 CheckErrorCUDA("ComputeJtE<Point>");
1836 template <int VN, int KH, bool JT>
1837 __global__ void jtjd_cam_vec32_kernel(int num, int add_existing_dq, float* jc,
1838 float* jtjd, float* jtjdi) {
1839 __shared__ float value[KH * 32];
1841 // 8thread per camera
1842 int cam = blockIdx.x * KH + threadIdx.y;
1843 int part = threadIdx.x & 0x7; // which parameter of this camera
1844 int part2 = threadIdx.x & 0xf;
1845 int campos = threadIdx.y << 5;
1846 int index = threadIdx.x + campos;
1848 if (cam < num && part < VN) {
1849 // read data range for this camera
1850 // 8 thread will do the same thing
1851 int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera
1852 int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1
1854 // loop to read the index of the projection.
1855 // so to get the location to read the jacobian
1856 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
1861 int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4;
1862 float temp = jc[ii + part2];
1869 if (cam >= num) return;
1870 // save all the results?
1872 if (threadIdx.x < 16) value[index] += value[index + 16];
1873 if (threadIdx.x < 8)
1876 if (threadIdx.x < 8) {
1877 float temp = value[index] + value[index + 8];
1878 int wpos = threadIdx.x + (cam << 3);
1879 if (add_existing_dq) temp += jtjd[wpos];
1881 jtjdi[wpos] = temp == 0 ? 0 : 1 / (temp);
1887 #define JTJD_POINT_KWIDTH 64
1890 __global__ void jtjd_point_kernel(int num, int rowsz, float4* jtjd,
1892 ////////////////////////////
1893 int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
1894 if (index >= num) return;
1896 int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera
1897 int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1
1898 float rx = 0, ry = 0, rz = 0;
1899 for (int i = idx1; i < idx2; ++i) {
1900 if (TEXN == 2 && i > 0xffffff) {
1901 float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1);
1906 float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1));
1911 float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1);
1916 float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1));
1923 if (jtjd) jtjd[index] = make_float4(rx, ry, rz, 0.0f);
1924 jtjdi[index] = make_float4(1.0f / rx, 1.0f / ry, 1.0f / rz, 0.0f);
1927 void ProgramCU::ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap,
1928 CuTexImage& jp, CuTexImage& pmap,
1929 CuTexImage& cmlist, CuTexImage& jtjd,
1930 CuTexImage& jtjdi, bool jc_transpose,
1931 int radial, bool add_existing_diagc) {
1932 //////////////////////////////////////////////////////////
1933 size_t szjc = jc.GetDataSize();
1934 unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras
1936 const unsigned int bheight = 2;
1937 dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight);
1938 PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
1941 jtjd_cam_vec32_kernel<8, bheight, true><<<grid1x, block1x>>>(
1942 ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1944 jtjd_cam_vec32_kernel<7, bheight, true><<<grid1x, block1x>>>(
1945 ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1947 PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
1949 jtjd_cam_vec32_kernel<8, bheight, false><<<grid1x, block1x>>>(
1950 ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1952 jtjd_cam_vec32_kernel<7, bheight, false><<<grid1x, block1x>>>(
1953 ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data());
1955 CheckErrorCUDA("ComputeDiagonal<Camera>");
1957 ////////////////////////////////////////////
1958 unsigned int npoint = (pmap.GetImgWidth() - 1);
1959 unsigned int len2 = npoint;
1960 unsigned int bsize2 = JTJD_POINT_KWIDTH;
1961 unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
1962 unsigned int bw, bh;
1963 GetBlockConfiguration(nblock2, bw, bh);
1964 dim3 grid2(bw, bh), block2(bsize2);
1965 PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
1967 if (jp.GetDataSize() > MAX_TEXSIZE) {
1968 PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
1969 jtjd_point_kernel<2><<<grid2, block2>>>(len2, (bw * bsize2),
1970 ((float4*)jtjd.data()) + 2 * ncam,
1971 ((float4*)jtjdi.data()) + 2 * ncam);
1973 PBA_BIND_TEX1D(tex_jtjd_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
1974 jtjd_point_kernel<1><<<grid2, block2>>>(len2, (bw * bsize2),
1975 ((float4*)jtjd.data()) + 2 * ncam,
1976 ((float4*)jtjdi.data()) + 2 * ncam);
1978 CheckErrorCUDA("ComputeDiagonal<Point>");
1983 __global__ void jtjd_cam_q_kernel(int num, int rowsz, float* qw, float4* diag) {
1984 int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y;
1985 int index = bindex + threadIdx.x;
1986 if (index >= num) return;
1987 int tid = index & 0x1;
1988 float w = qw[index], ws = w * w * 2.0f;
1990 float4 sj = tex1Dfetch(tex_jacobian_sj, index);
1991 float4 dj = tid == 0 ? make_float4(sj.x * sj.x * ws, 0, 0, 0)
1992 : make_float4(0, 0, 0, sj.w * sj.w * ws);
1995 float4 dj = tid == 0 ? make_float4(ws, 0, 0, 0) : make_float4(0, 0, 0, ws);
2000 void ProgramCU::ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj,
2002 unsigned int bsize = 32;
2003 unsigned int len = qlistw.GetImgWidth() * 2;
2004 unsigned int nblock = (len + bsize - 1) / bsize;
2005 unsigned int bw, bh;
2006 GetBlockConfiguration(nblock, bw, bh);
2007 dim3 grid(bw, bh), block(bsize);
2009 PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
2010 jtjd_cam_q_kernel<true><<<grid, block>>>(len, (bw * bsize), qlistw.data(),
2011 (float4*)diag.data());
2013 jtjd_cam_q_kernel<false><<<grid, block>>>(len, (bw * bsize), qlistw.data(),
2014 (float4*)diag.data());
2016 CheckErrorCUDA("ComputeDiagonalQ");
2019 template <int VN, int KH, bool JT>
2020 __global__ void jtjd_cam_block_vec32_kernel(int num, float lambda1,
2021 float lambda2, float* jc,
2022 float* diag, float* blocks,
2023 bool add_existing_diagc) {
2024 __shared__ float value[KH * 32 * VN];
2026 // 8thread per camera
2027 int cam = blockIdx.x * KH + threadIdx.y;
2028 int part = threadIdx.x & 0x7; // which parameter of this camera
2029 int part2 = threadIdx.x & 0xf;
2030 int index = threadIdx.x + (threadIdx.y << 5);
2031 float row[8] = {0, 0, 0, 0, 0, 0, 0, 0};
2033 int rowpos = index - part;
2034 // read data range for this camera
2035 // 8 thread will do the same thing
2036 int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera
2037 int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1
2039 // loop to read the index of the projection.
2040 // so to get the location to read the jacobian
2041 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
2044 value[index] = temp;
2045 for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]);
2047 int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4;
2048 float temp = jc[ii + part2];
2049 value[index] = temp;
2050 for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]);
2056 if (cam >= num) return;
2057 // save all the results?
2058 for (int i = 0; i < VN; ++i) value[index * VN + i] = row[i];
2059 int campos = threadIdx.y * (32 * VN);
2060 for (int i = threadIdx.x; i < (VN * 16); i += 32)
2061 value[campos + i] += value[campos + i + (16 * VN)];
2062 for (int i = threadIdx.x; i < (VN * 8); i += 32)
2063 value[campos + i] += value[campos + i + (8 * VN)];
2066 bool zero = (part >= VN);
2069 if (threadIdx.x < 8) {
2070 float* dp = value + campos + threadIdx.x * (VN + 1);
2071 float temp = zero ? 0 : dp[0];
2072 int didx = threadIdx.x + (cam << 3);
2073 if (add_existing_diagc) temp += diag[didx];
2075 dp[0] = lambda1 + lambda2 * temp;
2077 int wpos = cam * (8 * VN) + threadIdx.x;
2078 int rpos = campos + threadIdx.x - (threadIdx.x >> 3);
2079 blocks[wpos] = zero ? 0 : value[rpos];
2080 if (threadIdx.x < (VN * 8 - 32))
2081 blocks[wpos + 32] = zero ? 0 : value[rpos + 28];
2084 if (threadIdx.x < 8) {
2085 float* dp = value + campos + threadIdx.x * (VN + 1);
2087 int didx = threadIdx.x + (cam << 3);
2088 if (add_existing_diagc) temp += diag[didx];
2090 dp[0] = lambda1 + lambda2 * temp; // max(, 1e-6) ;
2092 int wpos = cam * (8 * VN) + threadIdx.x;
2093 int rpos = campos + threadIdx.x;
2094 blocks[wpos] = value[rpos];
2095 blocks[wpos + 32] = value[rpos + 32];
2099 #define JTJD_POINT_BLOCK_KWIDTH 64
2102 __global__ void jtjd_point_block_kernel(int num, int rowsz, float lambda1,
2103 float lambda2, float4* diag,
2105 ////////////////////////////
2106 int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
2107 if (index >= num) return;
2109 int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera
2110 int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1
2112 float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
2113 for (int i = idx1; i < idx2; ++i) {
2114 if (TEXN == 2 && i > 0xffffff) {
2115 float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1);
2123 float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1));
2131 float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1);
2139 float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1));
2149 diag[index] = make_float4(M00, M11, M22, 0);
2151 M00 = lambda2 * M00 + lambda1;
2152 M11 = lambda2 * M11 + lambda1;
2153 M22 = lambda2 * M22 + lambda1;
2155 // invert the 3x3 matrix.
2156 float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 -
2157 M02 * M02 * M11 - M12 * M12 * M00;
2158 if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
2159 int write_pos = index * 3;
2160 blocks[write_pos] = make_float4(0, 0, 0, 0);
2161 blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
2162 blocks[write_pos + 2] = make_float4(0, 0, 0, 0);
2164 float m00 = (M11 * M22 - M12 * M12) / det;
2165 float m01 = -(M01 * M22 - M12 * M02) / det;
2166 float m02 = (M01 * M12 - M02 * M11) / det;
2167 int write_pos = index * 3;
2168 blocks[write_pos] = make_float4(m00, m01, m02, 0);
2170 float m11 = (M00 * M22 - M02 * M02) / det;
2171 float m12 = -(M00 * M12 - M01 * M02) / det;
2172 blocks[write_pos + 1] = make_float4(m01, m11, m12, 0);
2174 float m22 = (M00 * M11 - M01 * M01) / det;
2175 blocks[write_pos + 2] = make_float4(m02, m12, m22, 0);
2179 #define JTJD_BLOCK_CAM_INVERT_KWIDTH 64
2181 __global__ void jtjd_cam_block_invert_kernel(int num, float4* blocks) {
2182 // N / 8 cameras...each have 64 floats,,,, N * 8 float
2183 // each will read 8 float......
2184 __shared__ float value[JTJD_BLOCK_CAM_INVERT_KWIDTH * VN];
2185 __shared__ bool invalid[JTJD_BLOCK_CAM_INVERT_KWIDTH / 8];
2186 //////////////////////////////////////////////
2188 int bindex = IMUL(blockIdx.x, blockDim.x);
2189 int index = bindex + threadIdx.x;
2190 int block_read_pos = IMUL(bindex, VN);
2191 for (int i = 0; i < JTJD_BLOCK_CAM_INVERT_KWIDTH * VN;
2192 i += JTJD_BLOCK_CAM_INVERT_KWIDTH)
2193 value[threadIdx.x + i] = ((float*)blocks)[block_read_pos + threadIdx.x + i];
2195 const int cam_id = threadIdx.x >> 3;
2196 const int cam_pos = IMUL(cam_id, VN * 8);
2197 const int col = threadIdx.x & 0x7, rowj_pos = col << 3;
2200 float* a = value + cam_pos;
2201 for (int i = 0; i < VN; ++i) {
2202 int rowi_pos = i << 3, dpos = i + rowi_pos;
2203 if (col == i && a[dpos] > 0) a[dpos] = rsqrt(a[dpos]);
2205 float diag = a[dpos];
2206 if (diag == 0 || col >= VN) continue;
2208 a[rowi_pos + col] = 0;
2209 } else if (col > i) {
2210 float aij = a[rowi_pos + col] * diag;
2211 a[rowi_pos + col] = aij;
2212 for (int k = col; k < VN; ++k) a[rowj_pos + k] -= a[rowi_pos + k] * aij;
2216 if (index >= num) return;
2218 if (col == 0) invalid[cam_id] = false;
2220 for (int i = 1; i < VN; ++i) {
2221 int rowi_pos = i << 3, dpos = i + rowi_pos;
2222 if (a[dpos] == 0) continue;
2225 for (int k = col; k < i; ++k)
2226 sum += (a[(k << 3) + i] * a[rowj_pos + k]);
2227 a[rowj_pos + i] = -sum * a[dpos];
2230 float ai[8], amax = 0;
2231 for (int i = 0; i < VN * 8; i += 8) {
2233 for (int k = 0; k < VN; k++) sum += a[rowj_pos + k] * a[i + k];
2235 amax = max(amax, sum);
2238 if (isinf(amax)) invalid[cam_id] = true;
2239 int write_pos = IMUL((index >> 3), (VN * 2)) + (col << 1);
2240 if (invalid[cam_id]) // a better way would be using a threshold
2242 blocks[write_pos] = make_float4(0, 0, 0, 0);
2243 blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
2245 blocks[write_pos] = make_float4(ai[0], ai[1], ai[2], ai[3]);
2246 blocks[write_pos + 1] =
2247 make_float4(ai[4], ai[5], ai[6], VN < 8 ? 0 : ai[7]);
2252 void ProgramCU::ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc,
2253 CuTexImage& cmap, CuTexImage& jp,
2254 CuTexImage& pmap, CuTexImage& cmlist,
2255 CuTexImage& diag, CuTexImage& blocks,
2256 int radial_distortion, bool jc_transpose,
2257 bool add_existing_diagc, int mode) {
2258 size_t szjc = jc.GetDataSize();
2259 unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras
2260 float lambda1 = dampd ? 0.0f : lambda;
2261 float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
2262 const unsigned int bheight = 2;
2263 dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight);
2264 PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
2268 } else if (radial_distortion) {
2270 jtjd_cam_block_vec32_kernel<8, bheight, true><<<grid1x, block1x>>>(
2271 ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2272 add_existing_diagc);
2274 PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
2275 jtjd_cam_block_vec32_kernel<8, bheight, false><<<grid1x, block1x>>>(
2276 ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2277 add_existing_diagc);
2281 jtjd_cam_block_vec32_kernel<7, bheight, true><<<grid1x, block1x>>>(
2282 ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2283 add_existing_diagc);
2285 PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
2286 jtjd_cam_block_vec32_kernel<7, bheight, false><<<grid1x, block1x>>>(
2287 ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(),
2288 add_existing_diagc);
2291 CheckErrorCUDA("ComputeDiagonalBlock<Camera>");
2293 ////////////////////////////////////////////
2294 unsigned int npoint = (pmap.GetImgWidth() - 1);
2295 unsigned int len2 = npoint;
2296 unsigned int bsize2 = JTJD_POINT_BLOCK_KWIDTH;
2297 unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
2298 unsigned int bw, bh;
2299 unsigned int offsetd = 2 * ncam;
2300 unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam;
2301 GetBlockConfiguration(nblock2, bw, bh);
2302 dim3 grid2(bw, bh), block2(bsize2);
2303 PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
2305 // camera only mode?
2306 } else if (jp.GetDataSize() > MAX_TEXSIZE) {
2307 PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
2308 jtjd_point_block_kernel<2><<<grid2, block2>>>(
2309 len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd,
2310 ((float4*)blocks.data()) + offsetb);
2312 PBA_BIND_TEX1D(tex_jtjd_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2313 jtjd_point_block_kernel<1><<<grid2, block2>>>(
2314 len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd,
2315 ((float4*)blocks.data()) + offsetb);
2317 CheckErrorCUDA("ComputeDiagonalBlock<Point>");
2320 unsigned int len3 = ncam * 8;
2321 unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH;
2322 unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3;
2323 dim3 grid3(nblock3), block3(bsize3);
2324 if (radial_distortion)
2325 jtjd_cam_block_invert_kernel<8><<<grid3, block3>>>(
2326 len3, (float4*)blocks.data());
2328 jtjd_cam_block_invert_kernel<7><<<grid3, block3>>>(
2329 len3, (float4*)blocks.data());
2330 CheckErrorCUDA("ComputeDiagonalBlockInverse<Camera>");
2334 template <int WIDTH, int BBIT, int VSZ>
2335 __global__ void multiply_block_conditioner_kernel(int num, int rowsz,
2336 float* blocks, float* x,
2338 __shared__ float mat[WIDTH * VSZ];
2339 __shared__ float val[WIDTH];
2340 const int BSZ = 1 << BBIT;
2341 const int BMASK = BSZ - 1;
2342 int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y;
2343 int index = bindex + threadIdx.x;
2344 int block_read_pos = bindex * VSZ;
2345 val[threadIdx.x] = x[index];
2346 for (int i = 0; i < VSZ * WIDTH; i += WIDTH)
2347 mat[i + threadIdx.x] = blocks[i + block_read_pos + threadIdx.x];
2349 if (index >= num) return;
2350 float* ac = mat + (threadIdx.x >> BBIT) * (BSZ * VSZ) + (threadIdx.x & BMASK);
2351 float* xc = val + (threadIdx.x & (~BMASK));
2353 for (int i = 0; i < VSZ; ++i) sum += ac[i << BBIT] * xc[i];
2354 result[index] = sum; // isinf(sum) ? 0 : sum ; //
2357 void ProgramCU::MultiplyBlockConditioner(int ncam, int npoint,
2358 CuTexImage& blocks, CuTexImage& vector,
2359 CuTexImage& result, int radial,
2361 const unsigned int bsize1 = 64;
2362 unsigned int bw, bh;
2365 unsigned int len1 = ncam * 8;
2366 unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1;
2367 GetBlockConfiguration(nblock1, bw, bh);
2368 dim3 grid1(bw, bh), block1(bsize1);
2370 multiply_block_conditioner_kernel<bsize1, 3, 8><<<grid1, block1>>>(
2371 len1, (bw * bsize1), blocks.data(), vector.data(), result.data());
2373 multiply_block_conditioner_kernel<bsize1, 3, 7><<<grid1, block1>>>(
2374 len1, (bw * bsize1), blocks.data(), vector.data(), result.data());
2375 CheckErrorCUDA("MultiplyBlockConditioner<Camera>");
2379 const unsigned int bsize2 = 128;
2380 unsigned int len2 = npoint * 4;
2381 unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
2382 unsigned int cbsz = radial ? 64 : 56;
2383 unsigned int offsetb = ncam * cbsz;
2384 unsigned int offsetd = ncam * 8;
2385 GetBlockConfiguration(nblock2, bw, bh);
2386 dim3 grid2(bw, bh), block2(bsize2);
2387 multiply_block_conditioner_kernel<bsize2, 2, 3><<<grid2, block2>>>(
2388 len2, (bw * bsize2), blocks.data() + offsetb, vector.data() + offsetd,
2389 result.data() + offsetd);
2390 CheckErrorCUDA("MultiplyBlockConditioner<Point>");
2396 __global__ void shuffle_camera_jacobian_kernel(int num, int bwidth,
2398 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2399 if (index >= num) return;
2400 int fetch_idx = tex1Dfetch(tex_shuffle_map, index >> 2);
2402 int texidx = fetch_idx >> 23,
2403 fidx = ((fetch_idx & 0x7fffff) << 2) + (index & 0x3);
2405 jc[index] = tex1Dfetch(tex_shuffle_jc, fidx);
2406 else if (texidx == 1)
2407 jc[index] = tex1Dfetch(tex_shuffle_jc2, fidx);
2410 jc[index] = tex1Dfetch(tex_shuffle_jc, (fetch_idx << 2) + (index & 0x3));
2414 bool ProgramCU::ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map,
2415 CuTexImage& result) {
2416 if (!result.IsValid()) return false;
2417 size_t szjc = jc.GetDataSize();
2418 unsigned int len = map.GetImgWidth() * 4;
2419 unsigned int bsize = 128;
2420 unsigned int nblock = (len + bsize - 1) / bsize;
2422 PBA_BIND_TEX1D(tex_shuffle_map, map, cudaReadModeElementType, PBA_ChanInt());
2424 if (szjc > 2 * MAX_TEXSIZE) {
2425 fprintf(stderr, "datasize way too big %lX, %lX+...\n", szjc,
2426 (szjc) / MAX_TEXSIZE);
2428 } else if (szjc > MAX_TEXSIZE) {
2429 unsigned int bw, bh;
2430 GetBlockConfiguration(nblock, bw, bh);
2431 dim3 grid(bw, bh), block(bsize);
2432 PBA_BIND_TEX1D_2(tex_shuffle_jc, tex_shuffle_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2433 shuffle_camera_jacobian_kernel<2><<<grid, block>>>(len, (bw * bsize),
2434 (float4*)result.data());
2436 PBA_BIND_TEX1D(tex_shuffle_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2437 unsigned int bw, bh;
2438 GetBlockConfiguration(nblock, bw, bh);
2439 dim3 grid(bw, bh), block(bsize);
2440 shuffle_camera_jacobian_kernel<1><<<grid, block>>>(len, (bw * bsize),
2441 (float4*)result.data());
2443 CheckErrorCUDA("ShuffleCameraJacobian");
2450 __global__ void multiply_jx_kernel(int num, int bwidth, int offset,
2452 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2453 if (index >= num) return;
2455 if (TEXN == 4 && (index >> 24) == 3) {
2456 ////////////////////////////////////////////
2457 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2458 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2459 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2460 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2462 ////////////////////////////////////////////
2463 float4 jp, jc1, jc2;
2464 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2465 jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1);
2466 jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1);
2468 /////////////////////////////////////
2469 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2470 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2471 jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2473 } else if (TEXN > 2 && (index >> 24) == 2) {
2474 ////////////////////////////////////////////
2475 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2476 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2477 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2478 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2480 ////////////////////////////////////////////
2481 float4 jp, jc1, jc2;
2482 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2483 jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1);
2484 jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1);
2486 /////////////////////////////////////
2487 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2488 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2489 jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2491 } else if (TEXN > 1 && (index > 0xffffff)) {
2492 ////////////////////////////////////////////
2493 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2494 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2495 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2496 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2498 ////////////////////////////////////////////
2499 float4 jp, jc1, jc2;
2500 jp = tex1Dfetch(tex_mjx_jp, index & 0x1ffffff);
2501 jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1);
2502 jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1);
2504 /////////////////////////////////////
2505 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2506 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2507 jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2510 ////////////////////////////////////////////
2511 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2512 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2513 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2514 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2516 ////////////////////////////////////////////
2517 float4 jp, jc1, jc2;
2518 jp = tex1Dfetch(tex_mjx_jp, index);
2519 jc1 = tex1Dfetch(tex_mjx_jc, index << 1);
2520 jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1);
2522 /////////////////////////////////////
2523 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2524 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2525 jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y +
2531 __global__ void multiply_jcx_kernel(int num, int bwidth, float* result) {
2532 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2533 if (index >= num) return;
2535 if (TEXN == 4 && (index >> 24) == 3) {
2536 ////////////////////////////////////////////
2537 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2538 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2539 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2541 ////////////////////////////////////////////
2543 jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1);
2544 jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1);
2546 /////////////////////////////////////
2547 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2548 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2549 jc2.z * xc2.z + jc2.w * xc2.w;
2550 } else if (TEXN > 2 && (index >> 24) == 2) {
2551 ////////////////////////////////////////////
2552 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2553 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2554 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2556 ////////////////////////////////////////////
2558 jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1);
2559 jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1);
2561 /////////////////////////////////////
2562 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2563 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2564 jc2.z * xc2.z + jc2.w * xc2.w;
2565 } else if (TEXN > 1 && (index > 0xffffff)) {
2566 ////////////////////////////////////////////
2567 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2568 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2569 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2571 ////////////////////////////////////////////
2573 jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1);
2574 jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1);
2576 /////////////////////////////////////
2577 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2578 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2579 jc2.z * xc2.z + jc2.w * xc2.w;
2581 ////////////////////////////////////////////
2582 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2583 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2584 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2586 ////////////////////////////////////////////
2588 jc1 = tex1Dfetch(tex_mjx_jc, index << 1);
2589 jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1);
2591 /////////////////////////////////////
2592 result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z +
2593 jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y +
2594 jc2.z * xc2.z + jc2.w * xc2.w;
2599 __global__ void multiply_jpx_kernel(int num, int bwidth, int offset,
2601 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2602 if (index >= num) return;
2604 if (TEXN == 2 && index > 0x1ffffff) {
2605 ////////////////////////////////////////////
2606 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2607 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2608 ////////////////////////////////////////////
2609 float4 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff);
2610 /////////////////////////////////////
2611 result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z;
2613 ////////////////////////////////////////////
2614 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2615 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2617 ////////////////////////////////////////////
2618 float4 jp = tex1Dfetch(tex_mjx_jp, index);
2619 /////////////////////////////////////
2620 result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z;
2625 __global__ void multiply_jx_notex2_kernel(int num, int bwidth, int offset,
2626 float* jcx, float* jpx,
2628 int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2629 int index = threadIdx.x + bindex;
2631 ////////////////////////////////////////////
2632 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2633 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2634 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2635 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2636 ////////////////////////////////////////////
2637 __shared__ float jps[KW * 4];
2638 __shared__ float jcs[KW * 8];
2640 for (int i = threadIdx.x; i < 4 * KW; i += KW)
2641 jps[i] = jpx[(bindex << 2) + i];
2642 for (int i = threadIdx.x; i < 8 * KW; i += KW)
2643 jcs[i] = jcx[(bindex << 3) + i];
2646 if (index >= num) return;
2648 /////////////////////////////////////
2649 float *jp = jps + threadIdx.x * 4, *jc = jcs + threadIdx.x * 8;
2650 result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z +
2651 jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y +
2652 jc[6] * xc2.z + jc[7] * xc2.w + jp[0] * xp.x + jp[1] * xp.y +
2657 __global__ void multiply_jpx_notex2_kernel(int num, int bwidth, int offset,
2658 float* jpx, float* result) {
2659 int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2660 int index = threadIdx.x + bindex;
2662 ////////////////////////////////////////////
2663 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2664 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
2665 ////////////////////////////////////////////
2666 __shared__ float jps[KW * 4];
2668 for (int i = threadIdx.x; i < 4 * KW; i += KW)
2669 jps[i] = jpx[(bindex << 2) + i];
2672 if (index >= num) return;
2674 /////////////////////////////////////
2675 float* jp = jps + threadIdx.x * 4;
2676 result[index] = jp[0] * xp.x + jp[1] * xp.y + jp[2] * xp.z;
2680 __global__ void multiply_jcx_notex2_kernel(int num, int bwidth, float* jcx,
2682 int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth;
2683 int index = threadIdx.x + bindex;
2685 ////////////////////////////////////////////
2686 int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1);
2687 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
2688 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
2689 ////////////////////////////////////////////
2691 __shared__ float jcs[KW * 8];
2692 for (int i = threadIdx.x; i < 8 * KW; i += KW)
2693 jcs[i] = jcx[(bindex << 3) + i];
2696 if (index >= num) return;
2698 /////////////////////////////////////
2699 float* jc = jcs + threadIdx.x * 8;
2700 result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z +
2701 jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y +
2702 jc[6] * xc2.z + jc[7] * xc2.w;
2705 void ProgramCU::ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc,
2706 CuTexImage& jp, CuTexImage& jmap, CuTexImage& result,
2708 // given a vector of parameters....
2709 // multiply the Jacobian Matrix with it [jc jp] * p
2710 // for each measurment, read back the jacobian
2711 // multiply and summ up th corresponding
2713 unsigned int nproj = jmap.GetImgWidth();
2714 unsigned int len = nproj * 2;
2715 unsigned int bsize = 64;
2716 unsigned int nblock = (len + bsize - 1) / bsize;
2717 unsigned int bw, bh;
2718 PBA_BIND_TEX1D(tex_mjx_idx, jmap, cudaReadModeElementType, PBA_ChanInt2());
2719 PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
2722 size_t szjc = jc.GetDataSize();
2723 if (TEX_TOOBIG4(szjc)) {
2724 GetBlockConfiguration(nblock, bw, bh);
2725 dim3 grid(bw, bh), block(bsize);
2726 multiply_jx_notex2_kernel<64><<<grid, block>>>(
2727 len, (bw * bsize), point_offset, jc.data(), jp.data(), result.data());
2728 } else if (szjc > 2 * MAX_TEXSIZE) {
2729 PBA_BIND_TEX1D_2(tex_mjx_jp, tex_mjx_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
2730 PBA_BIND_TEX1D_4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4, jc, cudaReadModeElementType, PBA_ChanFloat4());
2731 GetBlockConfiguration(nblock, bw, bh);
2732 dim3 grid(bw, bh), block(bsize);
2733 multiply_jx_kernel<4><<<grid, block>>>(len, (bw * bsize), point_offset,
2735 } else if (szjc > MAX_TEXSIZE) {
2736 PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2737 PBA_BIND_TEX1D_2(tex_mjx_jc, tex_mjx_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2738 GetBlockConfiguration(nblock, bw, bh);
2739 dim3 grid(bw, bh), block(bsize);
2740 multiply_jx_kernel<2><<<grid, block>>>(len, (bw * bsize), point_offset,
2743 PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2744 PBA_BIND_TEX1D(tex_mjx_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2745 GetBlockConfiguration(nblock, bw, bh);
2746 dim3 grid(bh, bw), block(bsize);
2747 multiply_jx_kernel<1><<<grid, block>>>(len, (bh * bsize), point_offset,
2750 CheckErrorCUDA("ComputeJX");
2751 } else if (mode == 1) {
2752 size_t szjc = jc.GetDataSize();
2753 if (TEX_TOOBIG4(szjc)) {
2754 GetBlockConfiguration(nblock, bw, bh);
2755 dim3 grid(bw, bh), block(bsize);
2756 multiply_jcx_notex2_kernel<64><<<grid, block>>>(len, (bw * bsize),
2757 jc.data(), result.data());
2758 } else if (szjc > 2 * MAX_TEXSIZE) {
2759 PBA_BIND_TEX1D_4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4, jc, cudaReadModeElementType, PBA_ChanFloat4());
2760 GetBlockConfiguration(nblock, bw, bh);
2761 dim3 grid(bw, bh), block(bsize);
2762 multiply_jcx_kernel<4><<<grid, block>>>(len, (bw * bsize), result.data());
2763 } else if (szjc > MAX_TEXSIZE) {
2764 PBA_BIND_TEX1D_2(tex_mjx_jc, tex_mjx_jc2, jc, cudaReadModeElementType, PBA_ChanFloat4());
2765 GetBlockConfiguration(nblock, bw, bh);
2766 dim3 grid(bw, bh), block(bsize);
2767 multiply_jcx_kernel<2><<<grid, block>>>(len, (bw * bsize), result.data());
2769 PBA_BIND_TEX1D(tex_mjx_jc, jc, cudaReadModeElementType, PBA_ChanFloat4());
2770 GetBlockConfiguration(nblock, bw, bh);
2771 dim3 grid(bh, bw), block(bsize);
2772 multiply_jcx_kernel<1><<<grid, block>>>(len, (bh * bsize), result.data());
2774 CheckErrorCUDA("ComputeJCX");
2775 } else if (mode == 2) {
2776 size_t szjp = jp.GetDataSize();
2777 if (szjp > MAX_TEXSIZE) {
2778 PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2779 GetBlockConfiguration(nblock, bw, bh);
2780 dim3 grid(bw, bh), block(bsize);
2781 multiply_jpx_kernel<2><<<grid, block>>>(len, (bw * bsize), point_offset,
2784 PBA_BIND_TEX1D(tex_mjx_jp, jp, cudaReadModeElementType, PBA_ChanFloat4());
2785 GetBlockConfiguration(nblock, bw, bh);
2786 dim3 grid(bh, bw), block(bsize);
2787 multiply_jpx_kernel<1><<<grid, block>>>(len, (bh * bsize), point_offset,
2790 CheckErrorCUDA("ComputeJPX");
2794 template <bool md, bool pd>
2795 __device__ void jacobian_internal(int camera_pos, int pt_pos, int tidx,
2796 float* r, float jic, float* jxc, float* jyc,
2797 float* jxp, float* jyp) {
2799 float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
2800 float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
2805 float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
2810 float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
2813 float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
2818 float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
2819 float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
2820 float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
2821 float f_p2 = FDIV(ft.x, z0 + ft.w);
2822 float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
2823 float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
2826 float rr1 = r3.y * p0_p2 * p0_p2;
2827 float rr2 = r3.y * p1_p2 * p1_p2;
2828 float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
2829 float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
2831 JACOBIAN_SET_JC_BEGIN
2832 float jfc = jic * (1 + rr1 + rr2);
2833 float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
2834 /////////////////////////////////////////////////////
2835 jxc[0] = p0_p2 * jfc;
2838 jxc[3] = -f_p2_x * p0_p2;
2839 jxc[4] = -f_p2_x * p0_p2 * y0;
2840 jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
2841 jxc[6] = -f_p2_x * y0;
2842 jxc[7] = ft_x_pn * p0_p2;
2844 jyc[0] = p1_p2 * jfc;
2847 jyc[3] = -f_p2_y * p1_p2;
2848 jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
2849 jyc[5] = f_p2_y * x0 * p1_p2;
2850 jyc[6] = f_p2_y * x0;
2851 jyc[7] = ft_x_pn * p1_p2;
2853 ///////////////////////////////////
2854 jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
2855 jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
2856 jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
2857 jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
2858 jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
2859 jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
2861 JACOBIAN_SET_JC_BEGIN
2862 jxc[0] = p0_p2 * jic;
2865 jxc[3] = -f_p2 * p0_p2;
2866 jxc[4] = -f_p2 * p0_p2 * y0;
2867 jxc[5] = f_p2 * (z0 + x0 * p0_p2);
2868 jxc[6] = -f_p2 * y0;
2870 jyc[0] = p1_p2 * jic;
2873 jyc[3] = -f_p2 * p1_p2;
2874 jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
2875 jyc[5] = f_p2 * x0 * p1_p2;
2879 float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
2880 float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
2881 jxc[7] = -ms.x * msn;
2882 jyc[7] = -ms.y * msn;
2888 ///////////////////////////////////
2889 jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
2890 jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
2891 jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
2892 jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
2893 jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
2894 jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
2898 template <bool md, bool pd>
2899 __device__ void jacobian_camera_internal(int camera_pos, int pt_pos, int tidx,
2900 float* r, float jic, float* jxc,
2903 float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
2904 float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
2909 float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
2914 float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
2917 float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
2922 float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
2923 float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
2924 float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
2925 float f_p2 = FDIV(ft.x, z0 + ft.w);
2926 float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
2927 float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
2928 #ifndef PBA_DISABLE_CONST_CAMERA
2949 float rr1 = r3.y * p0_p2 * p0_p2;
2950 float rr2 = r3.y * p1_p2 * p1_p2;
2951 float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
2952 float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
2953 float jfc = jic * (1 + rr1 + rr2);
2954 float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2);
2955 /////////////////////////////////////////////////////
2956 jxc[0] = p0_p2 * jfc;
2959 jxc[3] = -f_p2_x * p0_p2;
2960 jxc[4] = -f_p2_x * p0_p2 * y0;
2961 jxc[5] = f_p2_x * (z0 + x0 * p0_p2);
2962 jxc[6] = -f_p2_x * y0;
2963 jxc[7] = ft_x_pn * p0_p2;
2965 jyc[0] = p1_p2 * jfc;
2968 jyc[3] = -f_p2_y * p1_p2;
2969 jyc[4] = -f_p2_y * (z0 + y0 * p1_p2);
2970 jyc[5] = f_p2_y * x0 * p1_p2;
2971 jyc[6] = f_p2_y * x0;
2972 jyc[7] = ft_x_pn * p1_p2;
2974 jxc[0] = p0_p2 * jic;
2977 jxc[3] = -f_p2 * p0_p2;
2978 jxc[4] = -f_p2 * p0_p2 * y0;
2979 jxc[5] = f_p2 * (z0 + x0 * p0_p2);
2980 jxc[6] = -f_p2 * y0;
2982 jyc[0] = p1_p2 * jic;
2985 jyc[3] = -f_p2 * p1_p2;
2986 jyc[4] = -f_p2 * (z0 + y0 * p1_p2);
2987 jyc[5] = f_p2 * x0 * p1_p2;
2991 float2 ms = tex1Dfetch(tex_jacobian_meas, tidx);
2992 float msn = (ms.x * ms.x + ms.y * ms.y) * jic;
2993 jxc[7] = -ms.x * msn;
2994 jyc[7] = -ms.y * msn;
3003 __device__ void jacobian_point_internal(int camera_pos, int pt_pos, int tidx,
3004 float* r, float* jxp, float* jyp) {
3006 float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos);
3007 float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1);
3012 float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2);
3017 float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3);
3020 float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos);
3025 float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2];
3026 float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2];
3027 float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2];
3028 float f_p2 = FDIV(ft.x, z0 + ft.w);
3029 float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w);
3030 float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w);
3033 float rr1 = r3.y * p0_p2 * p0_p2;
3034 float rr2 = r3.y * p1_p2 * p1_p2;
3035 float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2);
3036 float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1);
3037 ///////////////////////////////////
3038 jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2);
3039 jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2);
3040 jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2);
3041 jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2);
3042 jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2);
3043 jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2);
3045 ///////////////////////////////////
3046 jxp[0] = f_p2 * (r[0] - r[6] * p0_p2);
3047 jxp[1] = f_p2 * (r[1] - r[7] * p0_p2);
3048 jxp[2] = f_p2 * (r[2] - r[8] * p0_p2);
3049 jyp[0] = f_p2 * (r[3] - r[6] * p1_p2);
3050 jyp[1] = f_p2 * (r[4] - r[7] * p1_p2);
3051 jyp[2] = f_p2 * (r[5] - r[8] * p1_p2);
3055 template <bool md, bool pd>
3056 __global__ void multiply_jx_noj_kernel(int num, int bwidth, int offset,
3057 float jic, float2* result) {
3058 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3059 if (index >= num) return;
3061 __shared__ float data[9 * 64];
3062 ////////////////////////////////////////////
3063 int2 proj = tex1Dfetch(tex_mjx_idx, index);
3064 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
3065 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
3066 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
3068 ////////////////////////////////////////////
3069 float jxc[8], jyc[8], jxp[3], jyp[3];
3070 jacobian_internal<md, pd>(proj.x << 1, proj.y, index, data + 9 * threadIdx.x,
3071 jic, jxc, jyc, jxp, jyp);
3073 /////////////////////////////////////
3074 result[index] = make_float2(
3075 jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w +
3076 jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w +
3077 jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z,
3078 jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w +
3079 jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w +
3080 jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z);
3083 template <bool md, bool pd>
3084 __global__ void multiply_jcx_noj_kernel(int num, int bwidth, float jic,
3086 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3087 if (index >= num) return;
3089 __shared__ float data[9 * 64];
3090 ////////////////////////////////////////////
3091 int2 proj = tex1Dfetch(tex_mjx_idx, index);
3092 float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x);
3093 float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1);
3095 ////////////////////////////////////////////
3096 float jxc[8], jyc[8];
3097 jacobian_camera_internal<md, pd>(proj.x << 1, proj.y, index,
3098 data + 9 * threadIdx.x, jic, jxc, jyc);
3100 /////////////////////////////////////
3101 result[index] = make_float2(
3102 jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w +
3103 jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w,
3104 jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w +
3105 jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w);
3109 __global__ void multiply_jpx_noj_kernel(int num, int bwidth, int offset,
3111 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3112 if (index >= num) return;
3114 __shared__ float data[9 * 64];
3115 ////////////////////////////////////////////
3116 int2 proj = tex1Dfetch(tex_mjx_idx, index);
3117 float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset);
3119 ////////////////////////////////////////////
3120 float jxp[3], jyp[3];
3121 jacobian_point_internal<pd>(proj.x << 1, proj.y, index,
3122 data + 9 * threadIdx.x, jxp, jyp);
3124 /////////////////////////////////////
3125 result[index] = make_float2(jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z,
3126 jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z);
3129 void ProgramCU::ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera,
3130 CuTexImage& point, CuTexImage& meas,
3131 CuTexImage& pjmap, bool intrinsic_fixed,
3132 int radial_distortion, int mode) {
3133 unsigned int nproj = pjmap.GetImgWidth();
3134 unsigned int len = nproj;
3135 unsigned int bsize = 64;
3136 unsigned int nblock = (len + bsize - 1) / bsize;
3137 unsigned int bw, bh;
3138 int point_offset = camera.GetImgWidth() * 2;
3139 float jfc = intrinsic_fixed ? 0 : 1.0f;
3141 /////////////////////////////
3142 PBA_BIND_TEX1D(tex_mjx_idx, pjmap, cudaReadModeElementType, PBA_ChanInt2());
3143 PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
3144 PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3145 PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3147 ///////////////////////////////////
3148 GetBlockConfiguration(nblock, bw, bh);
3149 dim3 grid(bw, bh), block(bsize);
3152 if (radial_distortion == -1) {
3153 PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3154 multiply_jx_noj_kernel<true, false><<<grid, block>>>(
3155 len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3156 } else if (radial_distortion) {
3157 multiply_jx_noj_kernel<false, true><<<grid, block>>>(
3158 len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3160 multiply_jx_noj_kernel<false, false><<<grid, block>>>(
3161 len, (bw * bsize), point_offset, jfc, (float2*)jx.data());
3164 CheckErrorCUDA("ComputeJX_");
3165 } else if (mode == 1) {
3166 if (radial_distortion == -1) {
3167 PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3168 multiply_jcx_noj_kernel<true, false><<<grid, block>>>(
3169 len, (bw * bsize), jfc, (float2*)jx.data());
3170 } else if (radial_distortion) {
3171 multiply_jcx_noj_kernel<false, true><<<grid, block>>>(
3172 len, (bw * bsize), jfc, (float2*)jx.data());
3174 multiply_jcx_noj_kernel<false, false><<<grid, block>>>(
3175 len, (bw * bsize), jfc, (float2*)jx.data());
3178 CheckErrorCUDA("ComputeJCX_");
3179 } else if (mode == 2) {
3180 if (radial_distortion == 1) {
3181 multiply_jpx_noj_kernel<true><<<grid, block>>>(
3182 len, (bw * bsize), point_offset, (float2*)jx.data());
3184 multiply_jpx_noj_kernel<false><<<grid, block>>>(
3185 len, (bw * bsize), point_offset, (float2*)jx.data());
3188 CheckErrorCUDA("ComputeJX_");
3192 template <bool md, bool pd, int KH>
3193 __global__ void jte_cam_vec_noj_kernel(int num, int rowsz, float jic,
3195 __shared__ float value[KH * 32 * 9]; // 8 * KH * 32
3196 int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
3197 if (cam >= num) return;
3199 // read data range for this camera
3200 // 8 thread will do the same thing
3201 int idx1 = tex1Dfetch(tex_jte_cmp, cam); // first camera
3202 int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1); // last camera + 1
3204 float* valuec = value + 32 * 9 * threadIdx.y;
3205 float* rp = valuec + threadIdx.x * 9;
3206 float rr[8], jxc[8], jyc[8];
3207 for (int i = 0; i < 8; ++i) rr[i] = 0;
3209 // loop to read the index of the projection.
3210 // so to get the location to read the jacobian
3211 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3212 int index = tex1Dfetch(tex_jte_cmt, i);
3213 int2 proj = tex1Dfetch(tex_jacobian_idx, index);
3214 jacobian_camera_internal<md, pd>(cam << 2, proj.y, index, rp, jic, jxc,
3216 float2 vv = tex1Dfetch(tex_jte_pe, index);
3218 for (int j = 0; j < 8; ++j) rr[j] += (jxc[j] * vv.x + jyc[j] * vv.y);
3221 float* valuei = valuec + 8 * threadIdx.x;
3222 for (int i = 0; i < 8; ++i) valuei[i] = rr[i];
3223 valuec[threadIdx.x] = (valuec[threadIdx.x] + valuec[threadIdx.x + 32] +
3224 valuec[threadIdx.x + 64] + valuec[threadIdx.x + 96] +
3225 valuec[threadIdx.x + 128] + valuec[threadIdx.x + 160] +
3226 valuec[threadIdx.x + 192] + valuec[threadIdx.x + 224]);
3227 if (threadIdx.x < 16) valuec[threadIdx.x] += valuec[threadIdx.x + 16];
3228 if (threadIdx.x < 8)
3229 valuec[threadIdx.x] = valuec[threadIdx.x] + valuec[threadIdx.x + 8];
3231 ////////////////////////////////////
3232 if (threadIdx.x < 8) jte[(cam << 3) + threadIdx.x] = valuec[threadIdx.x];
3235 template <bool pd, int KH>
3236 __global__ void jte_point_vec_noj_kernel(int num, int rowsz, float* jte) {
3237 ////////////////////////////
3238 __shared__ float value[KH * (9 * 32)];
3239 int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz;
3240 if (index >= num) return;
3242 int idx1 = tex1Dfetch(tex_jte_pmp, index); // first
3243 int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1
3244 float rx = 0, ry = 0, rz = 0, jxp[3], jyp[3];
3245 int rowp = threadIdx.y * 9 * 32;
3246 float* rp = value + threadIdx.x * 9 + rowp;
3247 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3248 float2 ev = tex1Dfetch(tex_jte_pe, i);
3249 int2 proj = tex1Dfetch(tex_jacobian_idx, i);
3250 jacobian_point_internal<pd>(proj.x << 1, proj.y, i, rp, jxp, jyp);
3251 rx += (jxp[0] * ev.x + jyp[0] * ev.y);
3252 ry += (jxp[1] * ev.x + jyp[1] * ev.y);
3253 rz += (jxp[2] * ev.x + jyp[2] * ev.y);
3256 int loc = (threadIdx.x << 2) + rowp;
3258 value[loc + 1] = ry;
3259 value[loc + 2] = rz;
3262 int ridx = threadIdx.x + rowp;
3263 value[ridx] = ((value[ridx] + value[ridx + 32]) +
3264 (value[ridx + 64] + value[ridx + 96]));
3265 if (threadIdx.x < 16) value[ridx] += value[ridx + 16];
3266 if (threadIdx.x < 8) value[ridx] += value[ridx + 8];
3267 if (threadIdx.x < 4)
3268 jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4];
3271 void ProgramCU::ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera,
3272 CuTexImage& point, CuTexImage& meas,
3273 CuTexImage& cmap, CuTexImage& cmlist,
3274 CuTexImage& pmap, CuTexImage& pjmap, CuTexImage& jp,
3275 bool intrinsic_fixed, int radial_distortion,
3277 PBA_BIND_TEX1D(tex_jacobian_idx, pjmap, cudaReadModeElementType, PBA_ChanInt2());
3278 PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3279 PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3280 if (radial_distortion) PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3282 PBA_BIND_TEX1D(tex_jte_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
3283 PBA_BIND_TEX1D(tex_jte_cmt, cmlist, cudaReadModeElementType, PBA_ChanInt());
3284 PBA_BIND_TEX1D(tex_jte_pe, e, cudaReadModeElementType, PBA_ChanFloat2());
3287 unsigned int bw, bh;
3288 float jfc = intrinsic_fixed ? 0 : 1.0f;
3289 int ncam = camera.GetImgWidth();
3290 const int bheight1 = 2, bsize = 32;
3291 int nblock1 = (ncam + bheight1 - 1) / bheight1;
3292 GetBlockConfiguration(nblock1, bw, bh);
3293 dim3 grid(bw, bh), block(bsize, bheight1);
3295 } else if (radial_distortion == -1)
3296 jte_cam_vec_noj_kernel<true, false, bheight1><<<grid, block>>>(
3297 ncam, bw * bheight1, jfc, jte.data());
3298 else if (radial_distortion)
3299 jte_cam_vec_noj_kernel<false, true, bheight1><<<grid, block>>>(
3300 ncam, bw * bheight1, jfc, jte.data());
3302 jte_cam_vec_noj_kernel<false, false, bheight1><<<grid, block>>>(
3303 ncam, bw * bheight1, jfc, jte.data());
3304 CheckErrorCUDA("ComputeJtE_<Camera>");
3306 int npt = point.GetImgWidth();
3307 unsigned int offsetv = 8 * ncam;
3308 const int bheight2 = 2, bsize2 = 32;
3309 int nblock2 = (npt + bheight2 - 1) / bheight2;
3310 GetBlockConfiguration(nblock2, bw, bh);
3311 dim3 grid2(bw, bh), block2(bsize2, bheight2);
3313 } else if (jp.IsValid()) {
3314 PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3315 PBA_BIND_TEX1D(tex_jte_pex, e, cudaReadModeElementType, PBA_ChanFloat());
3316 PBA_BIND_TEX1D_2(tex_jte_jp, tex_jte_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
3317 if (jp.GetDataSize() > MAX_TEXSIZE)
3318 jte_point_vec_kernel<bheight2, 2><<<grid2, block2>>>(
3319 npt, bw * bheight2, jte.data() + offsetv);
3321 jte_point_vec_kernel<bheight2, 1><<<grid2, block2>>>(
3322 npt, bw * bheight2, jte.data() + offsetv);
3324 PBA_BIND_TEX1D(tex_jte_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3325 if (radial_distortion && radial_distortion != -1)
3326 jte_point_vec_noj_kernel<true, bheight2><<<grid2, block2>>>(
3327 npt, bw * bheight2, jte.data() + offsetv);
3329 jte_point_vec_noj_kernel<false, bheight2><<<grid2, block2>>>(
3330 npt, bw * bheight2, jte.data() + offsetv);
3332 CheckErrorCUDA("ComputeJtE_<Point>");
3335 template <int KH, bool md, bool pd, bool scaling>
3336 __global__ void jtjd_cam_block_noj_kernel(int num, int rowsz, float lambda1,
3337 float lambda2, float jic, float* diag,
3339 bool add_existing_diagc) {
3340 const int VN = (md || pd) ? 8 : 7;
3341 __shared__ float buffer_all[32 * 9 * KH];
3342 __shared__ float value_all[64 * KH];
3344 // 8thread per camera
3345 int bcam = blockIdx.x * KH + blockIdx.y * rowsz;
3347 int cam = bcam + threadIdx.y;
3348 if (cam >= num) return;
3350 float* buffer = buffer_all + threadIdx.y * (32 * 9);
3351 float* value = value_all + threadIdx.y * 64;
3353 float jxc[8], jyc[8];
3354 float* rp = buffer + threadIdx.x * 9;
3355 float row0[VN], row1[VN - 1], row2[VN - 2], row3[VN - 3];
3356 float row4[VN - 4], row5[VN - 5], row6[VN - 6], row7[1] = {0};
3357 // read data range for this camera
3358 // 8 thread will do the same thing
3359 int idx1 = tex1Dfetch(tex_jtjd_cmp, cam); // first camera
3360 int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1); // last camera + 1
3362 #define REPEAT7(FUNC) \
3370 #define SETZERO(k) \
3371 for (int j = 0; j < VN - k; ++j) row##k[j] = 0;
3375 if (scaling && (pd || md)) {
3376 sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1));
3377 sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1);
3380 // loop to read the index of the projection.
3381 // so to get the location to read the jacobian
3382 for (int i = idx1 + threadIdx.x; i < idx2; i += 32) {
3383 /////////////////////////////////////////
3384 int index = tex1Dfetch(tex_jtjd_cmlist, i);
3385 int2 proj = tex1Dfetch(tex_jacobian_idx, index);
3387 ///////////////////////////////////////////////
3388 jacobian_camera_internal<md, pd>(cam << 2, proj.y, index, rp, jic, jxc,
3391 if (scaling && (pd || md)) {
3392 float* sj = (float*)sjv; // 32 threads...64 values
3393 for (int j = 0; j < VN; ++j) {
3399 ////////////////////////////////////////////////
3401 for (int j = k; j < VN; ++j) \
3402 row##k[j - k] += (jxc[k] * jxc[j] + jyc[k] * jyc[j])
3411 ////////////////////////////////////
3412 // make the matrix..//add up the 32 * 8 matrix
3413 #define JTJDSUM8_V1() \
3414 buffer[threadIdx.x] = \
3415 (buffer[threadIdx.x] + buffer[threadIdx.x + 32] + \
3416 buffer[threadIdx.x + 64] + buffer[threadIdx.x + 96] + \
3417 buffer[threadIdx.x + 128] + buffer[threadIdx.x + 160] + \
3418 buffer[threadIdx.x + 192] + buffer[threadIdx.x + 224]);
3420 #define JTJDSUM8_V2() \
3421 buffer[threadIdx.x] = \
3422 (((buffer[threadIdx.x] + buffer[threadIdx.x + 128]) + \
3423 (buffer[threadIdx.x + 64] + buffer[threadIdx.x + 192])) + \
3424 ((buffer[threadIdx.x + 32] + buffer[threadIdx.x + 160]) + \
3425 (buffer[threadIdx.x + 96] + buffer[threadIdx.x + 224])));
3427 #define STORE_ROWS(k) \
3428 for (int i = 0; i < (VN - k); ++i) bufi[i] = row##k[i]; \
3430 if (threadIdx.x < 16 - k) buffer[threadIdx.x] += buffer[threadIdx.x + 16]; \
3431 if (threadIdx.x < 8 - k) \
3432 value[threadIdx.x + k * 9] = buffer[threadIdx.x] + buffer[threadIdx.x + 8];
3434 float* bufi = buffer + threadIdx.x * 8;
3435 REPEAT7(STORE_ROWS);
3440 /////////////////////////////////////////////////////////////////////////////////////////////
3442 //////////////////////////////// (8 * i + j) -> (8 * j + i)
3443 //#define COPYSYM(i) if(threadIdx.x < VN - i - 1) value[threadIdx.x * 8 + i *
3444 //9 + 8] = value[threadIdx.x + i * 9 + 1];
3445 if (threadIdx.x < VN - 1) value[threadIdx.x * 8 + 8] = value[threadIdx.x + 1];
3446 if (threadIdx.x < VN - 2)
3447 value[threadIdx.x * 8 + 17] = value[threadIdx.x + 10];
3448 if (threadIdx.x < VN - 3)
3449 value[threadIdx.x * 8 + 26] = value[threadIdx.x + 19];
3450 if (threadIdx.x < VN - 4)
3451 value[threadIdx.x * 8 + 35] = value[threadIdx.x + 28];
3452 if (threadIdx.x < VN - 5)
3453 value[threadIdx.x * 8 + 44] = value[threadIdx.x + 37];
3454 if (threadIdx.x < VN - 6)
3455 value[threadIdx.x * 8 + 53] = value[threadIdx.x + 46];
3456 if (VN == 8 && threadIdx.x < VN - 7)
3457 value[threadIdx.x * 8 + 62] = value[threadIdx.x + 55];
3459 if (scaling && !pd && !md) {
3461 float* sj = (float*)sjv; // 32 threads...64 values
3462 sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1));
3463 sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1);
3464 float sji = sj[threadIdx.x & 0x07];
3465 value[threadIdx.x] *= (sji * sj[threadIdx.x / 8]);
3466 value[threadIdx.x + 32] *= (sji * sj[4 + threadIdx.x / 8]);
3469 bool zero = ((threadIdx.x & 0x7) == VN);
3471 ///////////write back
3472 if (threadIdx.x < 8) {
3473 float* dp = value + threadIdx.x * 9;
3474 float temp = zero ? 0 : dp[0];
3475 int didx = threadIdx.x + (cam << 3);
3476 if (add_existing_diagc) temp += diag[didx];
3478 dp[0] = lambda1 + lambda2 * temp;
3480 int wpos = cam * (8 * VN) + threadIdx.x;
3481 blocks[wpos] = zero ? 0 : value[threadIdx.x];
3482 if (threadIdx.x < VN * 8 - 32)
3483 blocks[wpos + 32] = zero ? 0 : value[threadIdx.x + 32];
3486 template <int KW, bool pd, bool scaling>
3487 __global__ void jtjd_point_block_noj_kernel(int num, int rowsz, float lambda1,
3488 float lambda2, float4* diag,
3489 float4* blocks, int ptx) {
3490 ////////////////////////////
3491 int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz;
3492 if (index >= num) return;
3494 __shared__ float value[KW * 9];
3495 int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first
3496 int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last + 1
3498 float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0;
3499 float jxp[3], jyp[3];
3500 float* rp = value + threadIdx.x * 9;
3503 if (scaling && pd) sj = tex1Dfetch(tex_jacobian_sj, index + ptx);
3505 for (int i = idx1; i < idx2; ++i) {
3506 int2 proj = tex1Dfetch(tex_jacobian_idx, i);
3507 jacobian_point_internal<pd>(proj.x << 1, proj.y, i, rp, jxp, jyp);
3509 if (scaling && pd) {
3517 M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]);
3518 M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]);
3519 M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]);
3520 M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]);
3521 M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]);
3522 M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]);
3525 if (scaling && !pd) {
3526 sj = tex1Dfetch(tex_jacobian_sj, index + ptx);
3527 M00 *= (sj.x * sj.x);
3528 M01 *= (sj.x * sj.y);
3529 M02 *= (sj.x * sj.z);
3530 M11 *= (sj.y * sj.y);
3531 M12 *= (sj.y * sj.z);
3532 M22 *= (sj.z * sj.z);
3535 diag[index] = make_float4(M00, M11, M22, 0);
3537 M00 = lambda2 * M00 + lambda1;
3538 M11 = lambda2 * M11 + lambda1;
3539 M22 = lambda2 * M22 + lambda1;
3541 // invert the 3x3 matrix.
3542 float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 -
3543 M02 * M02 * M11 - M12 * M12 * M00;
3544 if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) {
3545 int write_pos = index * 3;
3546 blocks[write_pos] = make_float4(0, 0, 0, 0);
3547 blocks[write_pos + 1] = make_float4(0, 0, 0, 0);
3548 blocks[write_pos + 2] = make_float4(0, 0, 0, 0);
3550 float m00 = (M11 * M22 - M12 * M12) / det;
3551 float m01 = -(M01 * M22 - M12 * M02) / det;
3552 float m02 = (M01 * M12 - M02 * M11) / det;
3553 int write_pos = index * 3;
3554 blocks[write_pos] = make_float4(m00, m01, m02, 0);
3556 float m11 = (M00 * M22 - M02 * M02) / det;
3557 float m12 = -(M00 * M12 - M01 * M02) / det;
3558 blocks[write_pos + 1] = make_float4(m01, m11, m12, 0);
3560 float m22 = (M00 * M11 - M01 * M01) / det;
3561 blocks[write_pos + 2] = make_float4(m02, m12, m22, 0);
3565 void ProgramCU::ComputeDiagonalBlock_(
3566 float lambda, bool dampd, CuTexImage& camera, CuTexImage& point,
3567 CuTexImage& meas, CuTexImage& cmap, CuTexImage& cmlist, CuTexImage& pmap,
3568 CuTexImage& jmap, CuTexImage& jp, CuTexImage& sj, CuTexImage& diag,
3569 CuTexImage& blocks, bool intrinsic_fixed, int radial_distortion,
3570 bool add_existing_diagc, int mode) {
3571 float lambda1 = dampd ? 0.0f : lambda;
3572 float lambda2 = dampd ? (1.0f + lambda) : 1.0f;
3573 float jfc = intrinsic_fixed ? 0.0f : 1.0f;
3575 //////////////////////////////////
3576 PBA_BIND_TEX1D(tex_jacobian_idx, jmap, cudaReadModeElementType, PBA_ChanInt2());
3577 PBA_BIND_TEX1D(tex_jacobian_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3578 PBA_BIND_TEX1D(tex_jacobian_pts, point, cudaReadModeElementType, PBA_ChanFloat4());
3579 PBA_BIND_TEX1D(tex_jtjd_cmp, cmap, cudaReadModeElementType, PBA_ChanInt());
3580 PBA_BIND_TEX1D(tex_jtjd_cmlist, cmlist, cudaReadModeElementType, PBA_ChanInt());
3582 ////////////////////////////////////////////////////
3583 const unsigned int bsize1 = 32;
3584 const unsigned int bheight1 = 2;
3585 unsigned int ncam = camera.GetImgWidth(); // how many cameras
3586 unsigned int nblock = (ncam + bheight1 - 1) / bheight1;
3587 unsigned int bw, bh;
3588 GetBlockConfiguration(nblock, bw, bh);
3589 dim3 block1(bsize1, bheight1), grid1(bw, bh);
3591 ///////////////////////////////////////////////////
3592 if (radial_distortion == -1) PBA_BIND_TEX1D(tex_jacobian_meas, meas, cudaReadModeElementType, PBA_ChanFloat2());
3594 // skip the camera part.
3595 } else if (sj.IsValid()) {
3596 PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3597 if (radial_distortion == -1)
3598 jtjd_cam_block_noj_kernel<bheight1, true, false, true><<<grid1, block1>>>(
3599 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3600 blocks.data(), add_existing_diagc);
3601 else if (radial_distortion)
3602 jtjd_cam_block_noj_kernel<bheight1, false, true, true><<<grid1, block1>>>(
3603 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3604 blocks.data(), add_existing_diagc);
3606 jtjd_cam_block_noj_kernel<bheight1, false, false,
3607 true><<<grid1, block1>>>(
3608 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3609 blocks.data(), add_existing_diagc);
3611 if (radial_distortion == -1)
3612 jtjd_cam_block_noj_kernel<bheight1, true, false,
3613 false><<<grid1, block1>>>(
3614 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3615 blocks.data(), add_existing_diagc);
3616 else if (radial_distortion)
3617 jtjd_cam_block_noj_kernel<bheight1, false, true,
3618 false><<<grid1, block1>>>(
3619 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3620 blocks.data(), add_existing_diagc);
3622 jtjd_cam_block_noj_kernel<bheight1, false, false,
3623 false><<<grid1, block1>>>(
3624 ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(),
3625 blocks.data(), add_existing_diagc);
3627 CheckErrorCUDA("ComputeDiagonalBlock_<Camera>");
3629 ////////////////////////////////////////////////////
3630 const unsigned int bsize2 = 64;
3631 unsigned int npoint = point.GetImgWidth();
3632 unsigned int len2 = npoint;
3633 unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2;
3634 unsigned int offsetd = 2 * ncam;
3635 unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam;
3636 GetBlockConfiguration(nblock2, bw, bh);
3637 dim3 grid2(bw, bh), block2(bsize2);
3638 PBA_BIND_TEX1D(tex_jtjd_pmp, pmap, cudaReadModeElementType, PBA_ChanInt());
3641 } else if (jp.IsValid()) {
3642 PBA_BIND_TEX1D_2(tex_jtjd_jp, tex_jtjd_jp2, jp, cudaReadModeElementType, PBA_ChanFloat4());
3643 if (jp.GetDataSize() > MAX_TEXSIZE)
3644 jtjd_point_block_kernel<2><<<grid2, block2>>>(
3645 len2, (bw * bsize2), lambda1, lambda2,
3646 ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb);
3648 jtjd_point_block_kernel<1><<<grid2, block2>>>(
3649 len2, (bw * bsize2), lambda1, lambda2,
3650 ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb);
3653 PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3654 if (radial_distortion && radial_distortion != -1)
3655 jtjd_point_block_noj_kernel<bsize2, true, true><<<grid2, block2>>>(
3656 len2, (bw * bsize2), lambda1, lambda2,
3657 ((float4*)diag.data()) + offsetd,
3658 ((float4*)blocks.data()) + offsetb, offsetd);
3660 jtjd_point_block_noj_kernel<bsize2, false, true><<<grid2, block2>>>(
3661 len2, (bw * bsize2), lambda1, lambda2,
3662 ((float4*)diag.data()) + offsetd,
3663 ((float4*)blocks.data()) + offsetb, offsetd);
3665 if (radial_distortion && radial_distortion != -1)
3666 jtjd_point_block_noj_kernel<bsize2, true, false><<<grid2, block2>>>(
3667 len2, (bw * bsize2), lambda1, lambda2,
3668 ((float4*)diag.data()) + offsetd,
3669 ((float4*)blocks.data()) + offsetb, 0);
3671 jtjd_point_block_noj_kernel<bsize2, false, false><<<grid2, block2>>>(
3672 len2, (bw * bsize2), lambda1, lambda2,
3673 ((float4*)diag.data()) + offsetd,
3674 ((float4*)blocks.data()) + offsetb, 0);
3677 CheckErrorCUDA("ComputeDiagonalBlock_<Point>");
3679 ////////////////////////////////////////////////////
3681 const unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH;
3682 unsigned int len3 = ncam * 8;
3683 unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3;
3684 dim3 grid3(nblock3), block3(bsize3);
3685 if (radial_distortion)
3686 jtjd_cam_block_invert_kernel<8><<<grid3, block3>>>(
3687 len3, (float4*)blocks.data());
3689 jtjd_cam_block_invert_kernel<7><<<grid3, block3>>>(
3690 len3, (float4*)blocks.data());
3691 CheckErrorCUDA("ComputeDiagonalBlockInverse<Camera>");
3695 __global__ void projection_q_kernel(int nproj, int rowsz, float2* pj) {
3696 ////////////////////////////////
3697 int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz;
3698 if (tidx >= nproj) return;
3699 int2 proj = tex1Dfetch(tex_projection_idx, tidx);
3700 float2 wq = tex1Dfetch(tex_projection_mea, tidx);
3701 ///////////////////////////////////
3702 float f1 = tex1Dfetch(tex_projection_cam, proj.x * 4).x;
3703 float r1 = tex1Dfetch(tex_projection_cam, proj.x * 4 + 3).w;
3704 float f2 = tex1Dfetch(tex_projection_cam, proj.y * 4).x;
3705 float r2 = tex1Dfetch(tex_projection_cam, proj.y * 4 + 3).w;
3706 pj[tidx] = make_float2(-wq.x * (f1 - f2), -wq.y * (r1 - r2));
3709 void ProgramCU::ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap,
3710 CuTexImage& qw, CuTexImage& proj,
3712 ///////////////////////////////////////
3713 unsigned int len = qmap.GetImgWidth();
3714 unsigned int bsize = PROJECTION_FRT_KWIDTH;
3715 unsigned int nblock = (len + bsize - 1) / bsize;
3716 unsigned int bw, bh;
3717 GetBlockConfiguration(nblock, bw, bh);
3718 dim3 grid(bw, bh), block(bsize);
3720 ///////////////////////////////////////////
3721 PBA_BIND_TEX1D(tex_projection_cam, camera, cudaReadModeElementType, PBA_ChanFloat4());
3722 PBA_BIND_TEX1D(tex_projection_idx, qmap, cudaReadModeElementType, PBA_ChanInt2());
3723 PBA_BIND_TEX1D(tex_projection_mea, qw, cudaReadModeElementType, PBA_ChanFloat2());
3725 //////////////////////////////
3726 projection_q_kernel<<<grid, block>>>(len, bw * bsize,
3727 ((float2*)proj.data()) + offset);
3731 __global__ void multiply_jqx_kernel(int num, int bwidth, float2* result) {
3732 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3733 if (index >= num) return;
3734 ////////////////////////////////////////////
3735 int2 proj = tex1Dfetch(tex_mjx_idx, index);
3736 float2 wq = tex1Dfetch(tex_jacobian_meas, index);
3737 int idx1 = proj.x * 2, idx2 = proj.y * 2;
3738 float x11 = tex1Dfetch(tex_mjx_x, idx1).x;
3739 float x17 = tex1Dfetch(tex_mjx_x, idx1 + 1).w;
3740 float x21 = tex1Dfetch(tex_mjx_x, idx2).x;
3741 float x27 = tex1Dfetch(tex_mjx_x, idx2 + 1).w;
3744 float s11 = tex1Dfetch(tex_jacobian_sj, idx1).x;
3745 float s17 = tex1Dfetch(tex_jacobian_sj, idx1 + 1).w;
3746 float s21 = tex1Dfetch(tex_jacobian_sj, idx2).x;
3747 float s27 = tex1Dfetch(tex_jacobian_sj, idx2 + 1).w;
3748 result[index] = make_float2((x11 * s11 - x21 * s21) * wq.x,
3749 (x17 * s17 - x27 * s27) * wq.y);
3751 result[index] = make_float2((x11 - x21) * wq.x, (x17 - x27) * wq.y);
3755 void ProgramCU::ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq,
3756 CuTexImage& sj, CuTexImage& jx, int offset) {
3757 unsigned int nproj = qmap.GetImgWidth();
3758 unsigned int len = nproj;
3759 unsigned int bsize = 64;
3760 unsigned int nblock = (len + bsize - 1) / bsize;
3761 unsigned int bw, bh;
3763 /////////////////////////////
3764 PBA_BIND_TEX1D(tex_mjx_idx, qmap, cudaReadModeElementType, PBA_ChanInt2());
3765 PBA_BIND_TEX1D(tex_mjx_x, x, cudaReadModeElementType, PBA_ChanFloat4());
3766 PBA_BIND_TEX1D(tex_jacobian_meas, wq, cudaReadModeElementType, PBA_ChanFloat2());
3768 ///////////////////////////////////
3769 GetBlockConfiguration(nblock, bw, bh);
3770 dim3 grid(bw, bh), block(bsize);
3773 PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3774 multiply_jqx_kernel<true><<<grid, block>>>(len, (bw * bsize),
3775 ((float2*)jx.data()) + offset);
3777 multiply_jqx_kernel<false><<<grid, block>>>(len, (bw * bsize),
3778 ((float2*)jx.data()) + offset);
3785 __global__ void jte_cam_q_kernel(int num, int bwidth, float* jte) {
3786 // int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz ;
3787 int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth;
3788 if (index >= num) return;
3789 int2 indexp = tex1Dfetch(tex_jte_q_idx, index);
3790 if (indexp.x == -1) return;
3791 float2 wq = tex1Dfetch(tex_jte_q_w, index);
3792 float2 e1 = tex1Dfetch(tex_jte_pe, indexp.x);
3793 float2 e2 = tex1Dfetch(tex_jte_pe, indexp.y);
3794 int index8 = index << 3;
3796 float s1 = tex1Dfetch(tex_jacobian_sj, index * 2).x;
3797 jte[index8] += s1 * wq.x * (e1.x - e2.x);
3798 float s7 = tex1Dfetch(tex_jacobian_sj, index * 2 + 1).w;
3799 jte[index8 + 7] += s7 * wq.y * (e1.y - e2.y);
3801 jte[index8] += wq.x * (e1.x - e2.x);
3802 jte[index8 + 7] += wq.y * (e1.y - e2.y);
3806 void ProgramCU::ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq,
3807 CuTexImage& sj, CuTexImage& jte) {
3808 int ncam = qlist.GetImgWidth();
3809 const int bsize = 32;
3810 int nblock = (ncam + bsize - 1) / bsize;
3811 unsigned int bw, bh;
3812 GetBlockConfiguration(nblock, bw, bh);
3813 dim3 grid(bw, bh), block(bsize);
3815 PBA_BIND_TEX1D(tex_jte_pe, pe, cudaReadModeElementType, PBA_ChanFloat2());
3816 PBA_BIND_TEX1D(tex_jte_q_idx, qlist, cudaReadModeElementType, PBA_ChanInt2());
3817 PBA_BIND_TEX1D(tex_jte_q_w, wq, cudaReadModeElementType, PBA_ChanFloat2());
3820 PBA_BIND_TEX1D(tex_jacobian_sj, sj, cudaReadModeElementType, PBA_ChanFloat4());
3821 jte_cam_q_kernel<true><<<grid, block>>>(ncam, (bw * bsize), jte.data());
3823 jte_cam_q_kernel<false><<<grid, block>>>(ncam, (bw * bsize), jte.data());