1 ////////////////////////////////////////////////////////////////////////////
3 // Author: Changchang Wu
4 // Description : implementation of ProgramCU and all CUDA kernels
6 // Copyright (c) 2007 University of North Carolina at Chapel Hill
9 // Permission to use, copy, modify and distribute this software and its
10 // documentation for educational, research and non-profit purposes, without
11 // fee, and without a written agreement is hereby granted, provided that the
12 // above copyright notice and the following paragraph appear in all copies.
14 // The University of North Carolina at Chapel Hill make no representations
15 // about the suitability of this software for any purpose. It is provided
16 // 'as is' without express or implied warranty.
18 // Please send BUG REPORTS to ccwu@cs.unc.edu
20 ////////////////////////////////////////////////////////////////////////////
22 #if defined(SIFTGPU_CUDA_ENABLED)
27 #include "CuTexImage.h"
28 #include "ProgramCU.h"
29 #include "GlobalUtil.h"
31 //----------------------------------------------------------------
32 //Begin SiftGPU setting section.
33 //////////////////////////////////////////////////////////
34 #define IMUL(X,Y) __mul24(X,Y)
35 //#define FDIV(X,Y) ((X)/(Y))
36 #define FDIV(X,Y) __fdividef(X,Y)
38 /////////////////////////////////////////////////////////
39 //filter kernel width range (don't change this)
40 #define KERNEL_MAX_WIDTH 33
41 #define KERNEL_MIN_WIDTH 5
43 //////////////////////////////////////////////////////////
44 //horizontal filter block size (32, 64, 128, 256, 512)
45 #define FILTERH_TILE_WIDTH 128
46 //thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
47 #define FILTERV_BLOCK_WIDTH 16
48 #define FILTERV_BLOCK_HEIGHT 32
49 //The corresponding image patch for a thread block
50 #define FILTERV_PIXEL_PER_THREAD 4
51 #define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
52 #define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
55 //////////////////////////////////////////////////////////
56 //thread block size for computing Difference of Gaussian
57 #define DOG_BLOCK_LOG_DIMX 7
58 #define DOG_BLOCK_LOG_DIMY 0
59 #define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX)
60 #define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY)
62 //////////////////////////////////////////////////////////
63 //thread block size for keypoint detection
64 #define KEY_BLOCK_LOG_DIMX 3
65 #define KEY_BLOCK_LOG_DIMY 3
66 #define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
67 #define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)
68 //#define KEY_OFFSET_ONE
69 //make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
70 //but it seems uncoalesced writes don't affect the speed
72 //////////////////////////////////////////////////////////
73 //thread block size for initializing list generation (64, 128, 256, 512 ...)
74 #define HIST_INIT_WIDTH 128
75 //thread block size for generating feature list (32, 64, 128, 256, 512, ...)
76 #define LISTGEN_BLOCK_DIM 128
79 /////////////////////////////////////////////////////////
80 //how many keypoint orientations to compute in a block
81 #define ORIENTATION_COMPUTE_PER_BLOCK 64
82 //how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
83 #define DESCRIPTOR_COMPUTE_PER_BLOCK 4
84 #define DESCRIPTOR_COMPUTE_BLOCK_SIZE (16 * DESCRIPTOR_COMPUTE_PER_BLOCK)
85 //how many keypoint descriptor to normalized in a block (32, ...)
86 #define DESCRIPTOR_NORMALIZ_PER_BLOCK 32
90 ///////////////////////////////////////////
91 //Thread block size for visualization
92 //(This doesn't affect the speed of computation)
93 #define BLOCK_LOG_DIM 4
94 #define BLOCK_DIM (1 << BLOCK_LOG_DIM)
96 //End SiftGPU setting section.
97 //----------------------------------------------------------------
100 __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
102 const static cudaTextureDesc texDataDesc = []() {
103 cudaTextureDesc textureDesc;
104 memset(&textureDesc, 0, sizeof(textureDesc));
105 textureDesc.readMode = cudaReadModeElementType;
106 textureDesc.addressMode[0] = cudaAddressModeClamp;
107 textureDesc.addressMode[1] = cudaAddressModeClamp;
108 textureDesc.addressMode[2] = cudaAddressModeClamp;
109 textureDesc.filterMode = cudaFilterModePoint;
110 textureDesc.normalizedCoords = false;
114 const static cudaTextureDesc texDataBDesc = []() {
115 cudaTextureDesc textureDesc;
116 memset(&textureDesc, 0, sizeof(textureDesc));
117 textureDesc.readMode = cudaReadModeNormalizedFloat;
118 textureDesc.addressMode[0] = cudaAddressModeClamp;
119 textureDesc.addressMode[1] = cudaAddressModeClamp;
120 textureDesc.addressMode[2] = cudaAddressModeClamp;
121 textureDesc.filterMode = cudaFilterModePoint;
122 textureDesc.normalizedCoords = false;
126 //////////////////////////////////////////////////////////////
127 template<int FW> __global__ void FilterH(cudaTextureObject_t texData, float* d_result, int width)
130 const int HALF_WIDTH = FW >> 1;
131 const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;
132 const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;
133 __shared__ float data[CACHE_WIDTH];
134 const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
135 const int col = bcol + threadIdx.x;
136 const int index_min = IMUL(blockIdx.y, width);
137 const int index_max = index_min + width - 1;
138 int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
139 int cache_index = threadIdx.x;
142 for(int j = 0; j < CACHE_COUNT; ++j)
144 if(cache_index < CACHE_WIDTH)
146 int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
147 data[cache_index] = tex1Dfetch<float>(texData,fetch_index);
148 src_index += FILTERH_TILE_WIDTH;
149 cache_index += FILTERH_TILE_WIDTH;
153 if(col >= width) return;
155 for(int i = 0; i < FW; ++i)
157 value += (data[threadIdx.x + i]* d_kernel[i]);
159 // value = Conv<FW-1>(data + threadIdx.x);
160 d_result[index_min + col] = value;
165 ////////////////////////////////////////////////////////////////////
166 template<int FW> __global__ void FilterV(cudaTextureObject_t texData, float* d_result, int width, int height)
168 const int HALF_WIDTH = FW >> 1;
169 const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
170 const int TEMP = CACHE_WIDTH & 0xf;
171 //add some extra space to avoid bank conflict
172 #if FILTERV_TILE_WIDTH == 16
173 //make the stride 16 * n +/- 1
174 const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
175 #elif FILTERV_TILE_WIDTH == 8
176 //make the stride 16 * n +/- 2
177 const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
178 #elif FILTERV_TILE_WIDTH == 4
179 //make the stride 16 * n +/- 4
180 const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
184 const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
185 const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT;
186 const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT;
187 __shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
188 const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
189 const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
190 const int row_first = row_block_first - HALF_WIDTH;
191 const int data_index_max = IMUL(height - 1, width) + col;
192 const int cache_col_start = threadIdx.y;
193 const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
194 int cache_index = cache_col_start + cache_row_start;
195 int data_index = IMUL(row_first + cache_col_start, width) + col;
200 for(int i = 0; i < CACHE_COUNT; ++i)
202 if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
204 int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
205 data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch<float>(texData,fetch_index);
206 data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);
212 if(col >= width) return;
214 int row = row_block_first + threadIdx.y;
215 int index_start = cache_row_start + threadIdx.y;
217 for(int i = 0; i < WRITE_COUNT; ++i,
218 row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT)
222 int index_dest = IMUL(row, width) + col;
225 for(int i = 0; i < FW; ++i)
227 value += (data[index_start + i] * d_kernel[i]);
229 d_result[index_dest] = value;
235 template<int LOG_SCALE> __global__ void UpsampleKernel(cudaTextureObject_t texData, float* d_result, int width)
237 const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
238 const float INV_SCALE = 1.0f / (float(SCALE));
239 int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
240 if(col >= width) return;
242 int row = blockIdx.y >> LOG_SCALE;
243 int index = row * width + col;
244 int dst_row = blockIdx.y;
245 int dst_idx= (width * dst_row + col) * SCALE;
246 int helper = blockIdx.y & SCALE_MASK;
249 float v11 = tex1Dfetch<float>(texData, index);
250 float v12 = tex1Dfetch<float>(texData, index + 1);
252 float v21 = tex1Dfetch<float>(texData, index);
253 float v22 = tex1Dfetch<float>(texData, index + 1);
254 float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
255 float v1 = (v21 * w1 + w2 * v11);
256 float v2 = (v22 * w1 + w2 * v12);
257 d_result[dst_idx] = v1;
259 for(int i = 1; i < SCALE; ++i)
261 const float r2 = i * INV_SCALE;
262 const float r1 = 1.0f - r2;
263 d_result[dst_idx +i] = v1 * r1 + v2 * r2;
267 float v1 = tex1Dfetch<float>(texData, index);
268 float v2 = tex1Dfetch<float>(texData, index + 1);
269 d_result[dst_idx] = v1;
271 for(int i = 1; i < SCALE; ++i)
273 const float r2 = i * INV_SCALE;
274 const float r1 = 1.0f - r2;
275 d_result[dst_idx +i] = v1 * r1 + v2 * r2;
281 ////////////////////////////////////////////////////////////////////////////////////////
282 void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
284 int width = src->GetImgWidth(), height = src->GetImgHeight();
285 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
286 dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
287 dim3 block(FILTERH_TILE_WIDTH);
290 case 1 : UpsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
291 case 2 : UpsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
292 case 3 : UpsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
297 template<int LOG_SCALE> __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width)
299 const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
300 if(dst_col >= dst_width) return;
301 const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));
302 const int dst_row = blockIdx.y;
303 const int src_row = blockIdx.y << LOG_SCALE;
304 const int src_idx = IMUL(src_row, src_width) + src_col;
305 const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
306 d_result[dst_idx] = tex1Dfetch<float>(texData, src_idx);
310 __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width, const int log_scale)
312 const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
313 if(dst_col >= dst_width) return;
314 const int src_col = min((dst_col << log_scale), (src_width - 1));
315 const int dst_row = blockIdx.y;
316 const int src_row = blockIdx.y << log_scale;
317 const int src_idx = IMUL(src_row, src_width) + src_col;
318 const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
319 d_result[dst_idx] = tex1Dfetch<float>(texData, src_idx);
323 void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
325 int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
327 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
328 dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
329 dim3 block(FILTERH_TILE_WIDTH);
332 case 1 : DownsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
333 case 2 : DownsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
334 case 3 : DownsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
335 default: DownsampleKernel <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width, log_scale);
339 __global__ void ChannelReduce_Kernel(cudaTextureObject_t texData, float* d_result)
341 int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
342 d_result[index] = tex1Dfetch<float>(texData, index*4);
345 __global__ void ChannelReduce_Convert_Kernel(cudaTextureObject_t texDataF4, float* d_result)
347 int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
348 float4 rgba = tex1Dfetch<float4>(texDataF4, index);
349 d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
352 void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
354 int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
356 dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
357 dim3 block(FILTERH_TILE_WIDTH);
360 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
361 ChannelReduce_Convert_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
364 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
365 ChannelReduce_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
369 __global__ void ConvertByteToFloat_Kernel(cudaTextureObject_t texDataB, float* d_result)
371 int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
372 d_result[index] = tex1Dfetch<float>(texDataB, index);
375 void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
377 int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
378 dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
379 dim3 block(FILTERH_TILE_WIDTH);
380 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataBDesc, cudaCreateChannelDesc<float>());
381 ConvertByteToFloat_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
384 void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
386 int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
389 if(width > KERNEL_MAX_WIDTH)
391 //filter size truncation
392 sz = KERNEL_MAX_WIDTH >> 1;
393 width =KERNEL_MAX_WIDTH;
394 }else if(width < KERNEL_MIN_WIDTH)
396 sz = KERNEL_MIN_WIDTH >> 1;
397 width =KERNEL_MIN_WIDTH;
400 float rv = 1.0f/(sigma*sigma), v, ksum =0;
402 // pre-compute filter
403 for( i = -sz ; i <= sz ; ++i)
405 kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
409 //normalize the kernel
411 for(i = 0; i< width ;i++) kernel[i]*=rv;
415 template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
417 int width = src->GetImgWidth(), height = src->GetImgHeight();
419 //horizontal filtering
420 CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
421 dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
422 dim3 blockh(FILTERH_TILE_WIDTH);
423 FilterH<FW><<<gridh, blockh>>>(srcTex.handle, (float*)buf->_cuData, width);
424 CheckErrorCUDA("FilterH");
426 ///vertical filtering
427 CuTexImage::CuTexObj bufTex = buf->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
428 dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
429 dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);
430 FilterV<FW><<<gridv, blockv>>>(bufTex.handle, (float*)dst->_cuData, width, height);
431 CheckErrorCUDA("FilterV");
434 //////////////////////////////////////////////////////////////////////
435 // tested on 2048x1500 image, the time on pyramid construction is
436 // OpenGL version : 18ms
437 // CUDA version: 28 ms
438 void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
440 float filter_kernel[KERNEL_MAX_WIDTH]; int width;
441 CreateFilterKernel(sigma, filter_kernel, width);
442 cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
446 case 5: FilterImage< 5>(dst, src, buf); break;
447 case 7: FilterImage< 7>(dst, src, buf); break;
448 case 9: FilterImage< 9>(dst, src, buf); break;
449 case 11: FilterImage<11>(dst, src, buf); break;
450 case 13: FilterImage<13>(dst, src, buf); break;
451 case 15: FilterImage<15>(dst, src, buf); break;
452 case 17: FilterImage<17>(dst, src, buf); break;
453 case 19: FilterImage<19>(dst, src, buf); break;
454 case 21: FilterImage<21>(dst, src, buf); break;
455 case 23: FilterImage<23>(dst, src, buf); break;
456 case 25: FilterImage<25>(dst, src, buf); break;
457 case 27: FilterImage<27>(dst, src, buf); break;
458 case 29: FilterImage<29>(dst, src, buf); break;
459 case 31: FilterImage<31>(dst, src, buf); break;
460 case 33: FilterImage<33>(dst, src, buf); break;
467 void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, float2* d_got, int width, int height)
469 int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
470 int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
471 if(col < width && row < height)
473 int index = IMUL(row, width) + col;
474 float vp = tex1Dfetch<float>(texP, index);
475 float v = tex1Dfetch<float>(texC, index);
476 d_dog[index] = v - vp;
477 float vxn = tex1Dfetch<float>(texC, index + 1);
478 float vxp = tex1Dfetch<float>(texC, index - 1);
479 float vyp = tex1Dfetch<float>(texC, index - width);
480 float vyn = tex1Dfetch<float>(texC, index + width);
481 float dx = vxn - vxp, dy = vyn - vyp;
482 float grd = 0.5f * sqrt(dx * dx + dy * dy);
483 float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
484 d_got[index] = make_float2(grd, rot);
488 void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, int width, int height)
490 int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
491 int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
492 if(col < width && row < height)
494 int index = IMUL(row, width) + col;
495 float vp = tex1Dfetch<float>(texP, index);
496 float v = tex1Dfetch<float>(texC, index);
497 d_dog[index] = v - vp;
501 void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
503 int width = gus->GetImgWidth(), height = gus->GetImgHeight();
504 dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
505 dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
506 CuTexImage::CuTexObj texCObj = gus->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
507 CuTexImage::CuTexObj texPObj = (gus-1)->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
509 ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, (float2*) got->_cuData, width, height);
511 ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, width, height);
515 #define READ_CMP_DOG_DATA(datai, tex, idx) \
516 datai[0] = tex1Dfetch<float>(tex, idx - 1);\
517 datai[1] = tex1Dfetch<float>(tex, idx);\
518 datai[2] = tex1Dfetch<float>(tex, idx + 1);\
521 nmax = max(nmax, datai[0]);\
522 nmax = max(nmax, datai[1]);\
523 nmax = max(nmax, datai[2]);\
524 if(v < nmax) goto key_finish;\
527 nmin = min(nmin, datai[0]);\
528 nmin = min(nmin, datai[1]);\
529 nmin = min(nmin, datai[2]);\
530 if(v > nmin) goto key_finish;\
534 void __global__ ComputeKEY_Kernel(cudaTextureObject_t texP, cudaTextureObject_t texC, cudaTextureObject_t texN, float4* d_key, int width, int colmax, int rowmax,
535 float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization)
538 float datap[3][3], datan[3][3];
539 #ifdef KEY_OFFSET_ONE
540 int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
541 int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
543 int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
544 int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
546 int index = IMUL(row, width) + col;
547 int idx[3] ={index - width, index, index + width};
549 float nmax, nmin, result = 0.0f;
550 float dx = 0, dy = 0, ds = 0;
551 bool offset_test_passed = true;
552 #ifdef KEY_OFFSET_ONE
553 if(row < rowmax && col < colmax)
555 if(row > 0 && col > 0 && row < rowmax && col < colmax)
559 data[1][1] = v = tex1Dfetch<float>(texC, idx[1]);
560 if(fabs(v) <= dog_threshold0) goto key_finish;
562 data[1][0] = tex1Dfetch<float>(texC, idx[1] - 1);
563 data[1][2] = tex1Dfetch<float>(texC, idx[1] + 1);
564 nmax = max(data[1][0], data[1][2]);
565 nmin = min(data[1][0], data[1][2]);
567 if(v <=nmax && v >= nmin) goto key_finish;
568 //if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
569 READ_CMP_DOG_DATA(data[0], texC, idx[0]);
570 READ_CMP_DOG_DATA(data[2], texC, idx[2]);
573 float vx2 = v * 2.0f;
574 float fxx = data[1][0] + data[1][2] - vx2;
575 float fyy = data[0][1] + data[2][1] - vx2;
576 float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
577 float temp1 = fxx * fyy - fxy * fxy;
578 float temp2 = (fxx + fyy) * (fxx + fyy);
579 if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;
582 //read the previous level
583 READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
584 READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
585 READ_CMP_DOG_DATA(datap[2], texP, idx[2]);
588 //read the next level
589 READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
590 READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
591 READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
593 if(subpixel_localization)
595 //subpixel localization
596 float fx = 0.5f * (data[1][2] - data[1][0]);
597 float fy = 0.5f * (data[2][1] - data[0][1]);
598 float fs = 0.5f * (datan[1][1] - datap[1][1]);
600 float fss = (datan[1][1] + datap[1][1] - vx2);
601 float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
602 float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);
604 //need to solve dx, dy, ds;
605 // |-fx| | fxx fxy fxs | |dx|
606 // |-fy| = | fxy fyy fys | * |dy|
607 // |-fs| | fxs fys fss | |ds|
608 float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
609 float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
610 float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
611 float maxa = max(max(A0.x, A1.x), A2.x);
616 float4 TEMP = A1; A1 = A0; A0 = TEMP;
617 }else if(maxa == A2.x)
619 float4 TEMP = A2; A2 = A0; A0 = TEMP;
621 A0.y /= A0.x; A0.z /= A0.x; A0.w/= A0.x;
622 A1.y -= A1.x * A0.y; A1.z -= A1.x * A0.z; A1.w -= A1.x * A0.w;
623 A2.y -= A2.x * A0.y; A2.z -= A2.x * A0.z; A2.w -= A2.x * A0.w;
624 if(abs(A2.y) > abs(A1.y))
626 float4 TEMP = A2; A2 = A1; A1 = TEMP;
628 if(abs(A1.y) >= 1e-10)
630 A1.z /= A1.y; A1.w /= A1.y;
631 A2.z -= A2.y * A1.z; A2.w -= A2.y * A1.w;
632 if(abs(A2.z) >= 1e-10)
635 dy = A1.w - ds * A1.z;
636 dx = A0.w - ds * A0.z - dy * A0.y;
639 fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
640 &&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
645 if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
648 if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
652 void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
654 int width = dog->GetImgWidth(), height = dog->GetImgHeight();
655 float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
656 CuTexImage* dogp = dog - 1;
657 CuTexImage* dogn = dog + 1;
658 #ifdef KEY_OFFSET_ONE
659 dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
661 dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
663 dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
664 CuTexImage::CuTexObj texPObj = dogp->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
665 CuTexImage::CuTexObj texCObj = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
666 CuTexImage::CuTexObj texNObj = dogn->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
667 Tedge = (Tedge+1)*(Tedge+1)/Tedge;
668 ComputeKEY_Kernel<<<grid, block>>>(texPObj.handle, texCObj.handle, texNObj.handle, (float4*) key->_cuData, width,
669 width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
675 void __global__ InitHist_Kernel(cudaTextureObject_t texDataF4, int4* hist, int ws, int wd, int height)
677 int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
678 int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
679 if(row < height && col < wd)
681 int hidx = IMUL(row, wd) + col;
683 int sidx = IMUL(row, ws) + scol;
684 int v[4] = {0, 0, 0, 0};
685 if(row > 0 && row < height -1)
688 for(int i = 0; i < 4 ; ++i, ++scol)
690 float4 temp = tex1Dfetch<float4>(texDataF4, sidx +i);
691 v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
694 hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
701 void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
703 int ws = key->GetImgWidth(), hs = key->GetImgHeight();
704 int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
705 dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
706 dim3 block(HIST_INIT_WIDTH, 1);
707 CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
708 InitHist_Kernel<<<grid, block>>>(keyTex.handle, (int4*) hist->_cuData, ws, wd, hd);
713 void __global__ ReduceHist_Kernel(cudaTextureObject_t texDataI4, int4* d_hist, int ws, int wd, int height)
715 int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
716 int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
717 if(row < height && col < wd)
719 int hidx = IMUL(row, wd) + col;
721 int sidx = IMUL(row, ws) + scol;
722 int v[4] = {0, 0, 0, 0};
724 for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
726 int4 temp = tex1Dfetch<int4>(texDataI4, sidx + i);
727 v[i] = temp.x + temp.y + temp.z + temp.w;
729 d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
733 void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
735 int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
736 int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
737 int temp = (int)floorf(logf(float(wd * 2/ 3)) / logf(2.0f));
738 const int wi = min(7, max(temp , 0));
739 CuTexImage::CuTexObj hist1Tex = hist1->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
741 const int BW = 1 << wi, BH = 1 << (7 - wi);
742 dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH);
744 ReduceHist_Kernel<<<grid, block>>>(hist1Tex.handle, (int4*)hist2->_cuData, ws, wd, hd);
748 void __global__ ListGen_Kernel(cudaTextureObject_t texDataList, cudaTextureObject_t texDataI4, int4* d_list, int list_len, int width)
750 int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
751 int4 pos = tex1Dfetch<int4>(texDataList, idx1);
752 int idx2 = IMUL(pos.y, width) + pos.x;
753 int4 temp = tex1Dfetch<int4>(texDataI4, idx2);
754 int sum1 = temp.x + temp.y;
755 int sum2 = sum1 + temp.z;
761 }else if(pos.z >= sum1)
765 }else if(pos.z >= temp.x)
770 if (idx1 < list_len) {
775 //input list (x, y) (x, y) ....
776 void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
778 int len = list->GetImgWidth();
779 CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
780 CuTexImage::CuTexObj histTex = hist->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
781 dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
782 dim3 block(LISTGEN_BLOCK_DIM);
783 ListGen_Kernel<<<grid, block>>>(listTex.handle, histTex.handle, (int4*) list->_cuData, len,
784 hist->GetImgWidth());
787 void __global__ ComputeOrientation_Kernel(cudaTextureObject_t texDataF2,
788 cudaTextureObject_t texDataF4,
789 cudaTextureObject_t texDataList,
792 int width, int height,
793 float sigma, float sigma_step,
794 float gaussian_factor, float sample_factor,
796 int existing_keypoint,
800 const float ten_degree_per_radius = 5.7295779513082320876798154814105;
801 const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
802 int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
803 if(idx >= list_len) return;
805 if(existing_keypoint)
807 key = tex1Dfetch<float4>(texDataF4, idx);
810 int4 ikey = tex1Dfetch<int4>(texDataList, idx);
811 key.x = ikey.x + 0.5f;
812 key.y = ikey.y + 0.5f;
814 if(subpixel || keepsign)
816 float4 offset = tex1Dfetch<float4>(texDataF4, IMUL(width, ikey.y) + ikey.x);
821 key.z *= pow(sigma_step, offset.w);
823 if(keepsign) key.z *= offset.x;
826 if(num_orientation == 0)
833 float gsigma = key.z * gaussian_factor;
834 float win = fabs(key.z) * sample_factor;
835 float dist_threshold = win * win + 0.5;
836 float factor = -0.5f / (gsigma * gsigma);
837 float xmin = max(1.5f, floorf(key.x - win) + 0.5f);
838 float ymin = max(1.5f, floorf(key.y - win) + 0.5f);
839 float xmax = min(width - 1.5f, floorf(key.x + win) + 0.5f);
840 float ymax = min(height -1.5f, floorf(key.y + win) + 0.5f);
842 for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
843 for(float y = ymin; y <= ymax; y += 1.0f)
845 for(float x = xmin; x <= xmax; x += 1.0f)
847 float dx = x - key.x;
848 float dy = y - key.y;
849 float sq_dist = dx * dx + dy * dy;
850 if(sq_dist >= dist_threshold) continue;
851 float2 got = tex2D<float2>(texDataF2, x, y);
852 float weight = got.x * exp(sq_dist * factor);
853 float fidx = floorf(got.y * ten_degree_per_radius);
855 if(oidx < 0) oidx += 36;
856 vote[oidx] += weight;
862 const float one_third = 1.0 /3.0;
864 for(int i = 0; i < 6; ++i)
867 float pre = vote[35];
869 for(int j = 0; j < 36; ++j)
871 float temp = one_third * (pre + vote[j] + vote[j + 1]);
872 pre = vote[j]; vote[j] = temp;
877 if(num_orientation == 1 || existing_keypoint)
880 float max_vote = vote[0];
882 for(int i = 1; i < 36; ++i)
884 index_max = vote[i] > max_vote? i : index_max;
885 max_vote = max(max_vote, vote[i]);
887 float pre = vote[index_max == 0? 35 : index_max -1];
888 float next = vote[index_max + 1];
889 float weight = max_vote;
890 float off = 0.5f * FDIV(next - pre, weight + weight - next - pre);
891 key.w = radius_per_ten_degrees * (index_max + 0.5f + off);
896 float max_vote = vote[0];
898 for(int i = 1; i < 36; ++i) max_vote = max(max_vote, vote[i]);
900 float vote_threshold = max_vote * 0.8f;
901 float pre = vote[35];
902 float max_rot[2], max_vot[2] = {0, 0};
905 for(int i =0; i < 36; ++i)
907 float next = vote[i + 1];
908 if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
910 float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre);
911 float rot = i + di + 0.5f;
912 float weight = vote[i];
914 if(weight > max_vot[1])
916 if(weight > max_vot[0])
918 max_vot[1] = max_vot[0];
919 max_rot[1] = max_rot[0];
933 float fr1 = max_rot[0] / 36.0f;
934 if(fr1 < 0) fr1 += 1.0f;
935 unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floorf(fr1 * 65535.0f));
936 unsigned short us2 = 65535;
939 float fr2 = max_rot[1] / 36.0f;
940 if(fr2 < 0) fr2 += 1.0f;
941 us2 = (unsigned short ) floorf(fr2 * 65535.0f);
943 unsigned int uspack = (us2 << 16) | us1;
944 key.w = __int_as_float(uspack);
953 void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,
954 float sigma, float sigma_step, int existing_keypoint)
956 int len = list->GetImgWidth();
958 int width = got->GetImgWidth(), height = got->GetImgHeight();
959 CuTexImage::CuTexObj texObjF4;
960 CuTexImage::CuTexObj texObjList;
961 if(existing_keypoint)
963 texObjF4 = list->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
966 texObjList = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
967 if(GlobalUtil::_SubpixelLocalization)
969 texObjF4 = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
973 CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc<float2>());
975 const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK;
976 dim3 grid((len + block_width -1) / block_width);
977 dim3 block(block_width);
979 ComputeOrientation_Kernel<<<grid, block>>>(
983 (float4*) list->_cuData,
984 len, width, height, sigma, sigma_step,
985 GlobalUtil::_OrientationGaussianFactor,
986 GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor,
987 GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,
988 existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign);
990 ProgramCU::CheckErrorCUDA("ComputeOrientation");
993 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num,
994 int width, int height, float window_factor)
996 const float rpi = 4.0/ 3.14159265358979323846;
997 int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
999 if(fidx >= num) return;
1000 float4 key = tex1Dfetch<float4>(texDataF4, fidx);
1001 int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1002 float spt = fabs(key.z * window_factor);
1003 float s, c; __sincosf(key.w, &s, &c);
1004 float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ;
1005 float cspt = c * spt, sspt = s * spt;
1006 float crspt = c / spt, srspt = s / spt;
1007 float2 offsetpt, pt;
1008 float xmin, ymin, xmax, ymax, bsz;
1009 offsetpt.x = ix - 1.5f;
1010 offsetpt.y = iy - 1.5f;
1011 pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x;
1012 pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y;
1013 bsz = fabs(cspt) + fabs(sspt);
1014 xmin = max(1.5f, floorf(pt.x - bsz) + 0.5f);
1015 ymin = max(1.5f, floorf(pt.y - bsz) + 0.5f);
1016 xmax = min(width - 1.5f, floorf(pt.x + bsz) + 0.5f);
1017 ymax = min(height - 1.5f, floorf(pt.y + bsz) + 0.5f);
1020 for(int i =0; i < 9; ++i) des[i] = 0.0f;
1021 for(float y = ymin; y <= ymax; y += 1.0f)
1023 for(float x = xmin; x <= xmax; x += 1.0f)
1025 float dx = x - pt.x;
1026 float dy = y - pt.y;
1027 float nx = crspt * dx + srspt * dy;
1028 float ny = crspt * dy - srspt * dx;
1029 float nxn = fabs(nx);
1030 float nyn = fabs(ny);
1031 if(nxn < 1.0f && nyn < 1.0f)
1033 float2 cc = tex2D<float2>(texDataF2, x, y);
1034 float dnx = nx + offsetpt.x;
1035 float dny = ny + offsetpt.y;
1036 float ww = exp(-0.125f * (dnx * dnx + dny * dny));
1037 float wx = 1.0 - nxn;
1038 float wy = 1.0 - nyn;
1039 float weight = ww * wx * wy * cc.x;
1040 float theta = (anglef - cc.y) * rpi;
1041 if(theta < 0) theta += 8.0f;
1042 float fo = floorf(theta);
1044 float weight1 = fo + 1.0f - theta;
1045 float weight2 = theta - fo;
1046 if(DYNAMIC_INDEXING)
1048 des[fidx] += (weight1 * weight);
1049 des[fidx + 1] += (weight2 * weight);
1050 //this dynamic indexing part might be slow
1054 for(int k = 0; k < 8; ++k)
1058 des[k] += (weight1 * weight);
1059 des[k+1] += (weight2 * weight);
1068 int didx = idx << 1;
1069 d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1070 d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1074 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num,
1075 int width, int height, float window_factor)
1077 const float rpi = 4.0/ 3.14159265358979323846;
1078 int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1079 int fidx = idx >> 4;
1080 if(fidx >= num) return;
1081 float4 key = tex1Dfetch<float4>(texDataF4, fidx);
1082 int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1083 //float aspect_ratio = key.w / key.z;
1084 //float aspect_sq = aspect_ratio * aspect_ratio;
1085 float sptx = key.z * 0.25, spty = key.w * 0.25;
1086 float xmin, ymin, xmax, ymax; float2 pt;
1087 pt.x = sptx * (ix + 0.5f) + key.x;
1088 pt.y = spty * (iy + 0.5f) + key.y;
1089 xmin = max(1.5f, floorf(pt.x - sptx) + 0.5f);
1090 ymin = max(1.5f, floorf(pt.y - spty) + 0.5f);
1091 xmax = min(width - 1.5f, floorf(pt.x + sptx) + 0.5f);
1092 ymax = min(height - 1.5f, floorf(pt.y + spty) + 0.5f);
1095 for(int i =0; i < 9; ++i) des[i] = 0.0f;
1096 for(float y = ymin; y <= ymax; y += 1.0f)
1098 for(float x = xmin; x <= xmax; x += 1.0f)
1100 float nx = (x - pt.x) / sptx;
1101 float ny = (y - pt.y) / spty;
1102 float nxn = fabs(nx);
1103 float nyn = fabs(ny);
1104 if(nxn < 1.0f && nyn < 1.0f)
1106 float2 cc = tex2D<float2>(texDataF2, x, y);
1107 float wx = 1.0 - nxn;
1108 float wy = 1.0 - nyn;
1109 float weight = wx * wy * cc.x;
1110 float theta = (- cc.y) * rpi;
1111 if(theta < 0) theta += 8.0f;
1112 float fo = floorf(theta);
1114 float weight1 = fo + 1.0f - theta;
1115 float weight2 = theta - fo;
1116 if(DYNAMIC_INDEXING)
1118 des[fidx] += (weight1 * weight);
1119 des[fidx + 1] += (weight2 * weight);
1120 //this dynamic indexing part might be slow
1124 for(int k = 0; k < 8; ++k)
1128 des[k] += (weight1 * weight);
1129 des[k+1] += (weight2 * weight);
1138 int didx = idx << 1;
1139 d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1140 d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1143 void __global__ NormalizeDescriptor_Kernel(cudaTextureObject_t texDataF4, float4* d_des, int num)
1146 int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1147 if(idx >= num) return;
1148 int sidx = idx << 5;
1149 float norm1 = 0, norm2 = 0;
1151 for(int i = 0; i < 32; ++i)
1153 temp[i] = tex1Dfetch<float4>(texDataF4, sidx +i);
1154 norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1155 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1157 norm1 = rsqrt(norm1);
1160 for(int i = 0; i < 32; ++i)
1162 temp[i].x = min(0.2f, temp[i].x * norm1);
1163 temp[i].y = min(0.2f, temp[i].y * norm1);
1164 temp[i].z = min(0.2f, temp[i].z * norm1);
1165 temp[i].w = min(0.2f, temp[i].w * norm1);
1166 norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1167 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1170 norm2 = rsqrt(norm2);
1172 for(int i = 0; i < 32; ++i)
1174 temp[i].x *= norm2; temp[i].y *= norm2;
1175 temp[i].z *= norm2; temp[i].w *= norm2;
1176 d_des[sidx + i] = temp[i];
1180 void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream)
1182 int num = list->GetImgWidth();
1183 int width = got->GetImgWidth();
1184 int height = got->GetImgHeight();
1186 dtex->InitTexture(num * 128, 1, 1);
1187 CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc<float2>());
1188 CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1189 int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
1190 dim3 grid((num * 16 + block_width -1) / block_width);
1191 dim3 block(block_width);
1195 if(GlobalUtil::_UseDynamicIndexing)
1196 ComputeDescriptorRECT_Kernel<true><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1198 ComputeDescriptorRECT_Kernel<false><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1202 if(GlobalUtil::_UseDynamicIndexing)
1203 ComputeDescriptor_Kernel<true><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1205 ComputeDescriptor_Kernel<false><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1207 if(GlobalUtil::_NormalizedSIFT)
1209 CuTexImage::CuTexObj dtexTex = dtex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1210 const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK;
1211 dim3 grid((num + block_width -1) / block_width);
1212 dim3 block(block_width);
1213 NormalizeDescriptor_Kernel<<<grid, block>>>(dtexTex.handle, (float4*) dtex->_cuData, num);
1215 CheckErrorCUDA("ComputeDescriptor");
1218 //////////////////////////////////////////////////////
1219 void ProgramCU::FinishCUDA()
1221 cudaDeviceSynchronize();
1224 int ProgramCU::CheckErrorCUDA(const char* location)
1226 cudaError_t e = cudaGetLastError();
1229 if(location) fprintf(stderr, "%s:\t", location);
1230 fprintf(stderr, "%s\n", cudaGetErrorString(e));
1239 void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height)
1241 int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1242 int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1243 if(col < width && row < height)
1245 int index = row * width + col;
1246 float v = tex1Dfetch<float>(texData, index);
1247 d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1248 0.5 : __saturatef(0.5+20.0*v);
1252 void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
1254 if(out->_cuData == NULL) return;
1255 int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
1256 CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1257 dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1258 dim3 block(BLOCK_DIM, BLOCK_DIM);
1259 ConvertDOG_Kernel<<<grid, block>>>(dogTex.handle, (float*) out->_cuData, width, height);
1260 ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
1263 void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height)
1265 int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1266 int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1267 if(col < width && row < height)
1269 int index = row * width + col;
1270 float v = tex1Dfetch<float>(texData, index << 1);
1271 d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1272 0 : __saturatef(5 * v);
1278 void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
1280 if(out->_cuData == NULL) return;
1281 int width = got->GetImgWidth(), height = got ->GetImgHeight();
1282 CuTexImage::CuTexObj gotTex = got->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1283 dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1284 dim3 block(BLOCK_DIM, BLOCK_DIM);
1285 ConvertGRD_Kernel<<<grid, block>>>(gotTex.handle, (float*) out->_cuData, width, height);
1286 ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
1289 void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject_t texDataF4, float4* d_result, int width, int height)
1292 int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1293 int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1294 if(col < width && row < height)
1296 int index = row * width + col;
1297 float4 keyv = tex1Dfetch<float4>(texDataF4, index);
1298 int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);
1299 int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
1300 float v = inside? __saturatef(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
1301 d_result[index] = is_key && inside ?
1302 (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):
1303 make_float4(v, v, v, 1.0f) ;
1306 void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
1308 if(out->_cuData == NULL) return;
1309 int width = key->GetImgWidth(), height = key ->GetImgHeight();
1310 CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1311 CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1312 dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1313 dim3 block(BLOCK_DIM, BLOCK_DIM);
1314 ConvertKEY_Kernel<<<grid, block>>>(dogTex.handle, keyTex.handle, (float4*) out->_cuData, width, height);
1318 void __global__ DisplayKeyPoint_Kernel(cudaTextureObject_t texDataF4, float4 * d_result, int num)
1320 int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1321 if(idx >= num) return;
1322 float4 v = tex1Dfetch<float4>(texDataF4, idx);
1323 d_result[idx] = make_float4(v.x, v.y, 0, 1.0f);
1326 void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
1328 int num = ftex->GetImgWidth();
1329 int block_width = 64;
1330 dim3 grid((num + block_width -1) /block_width);
1331 dim3 block(block_width);
1332 CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1333 DisplayKeyPoint_Kernel<<<grid, block>>>(ftexTex.handle, (float4*) out->_cuData, num);
1334 ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
1337 void __global__ DisplayKeyBox_Kernel(cudaTextureObject_t texDataF4, float4* d_result, int num)
1339 int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1340 if(idx >= num) return;
1341 int kidx = idx / 10, vidx = idx - IMUL(kidx , 10);
1342 float4 v = tex1Dfetch<float4>(texDataF4, kidx);
1343 float sz = fabs(v.z * 3.0f);
1344 ///////////////////////
1345 float s, c; __sincosf(v.w, &s, &c);
1346 ///////////////////////
1347 float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz);
1348 float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz);
1350 pos.x = v.x + c * dx - s * dy;
1351 pos.y = v.y + c * dy + s * dx;
1352 pos.z = 0; pos.w = 1.0f;
1353 d_result[idx] = pos;
1356 void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
1358 int len = ftex->GetImgWidth();
1359 int block_width = 32;
1360 dim3 grid((len * 10 + block_width -1) / block_width);
1361 dim3 block(block_width);
1362 CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1363 DisplayKeyBox_Kernel<<<grid, block>>>(ftexTex.handle, (float4*) out->_cuData, len * 10);
1366 int ProgramCU::CheckCudaDevice(int device)
1368 int count = 0, device_used;
1369 if(cudaGetDeviceCount(&count) != cudaSuccess || count <= 0)
1371 ProgramCU::CheckErrorCUDA("CheckCudaDevice");
1373 }else if(count == 1)
1375 cudaDeviceProp deviceProp;
1376 if ( cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess ||
1377 (deviceProp.major == 9999 && deviceProp.minor == 9999))
1379 fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
1383 GlobalUtil::_MemCapGPU = deviceProp.totalGlobalMem / 1024;
1384 GlobalUtil::_texMaxDimGL = 32768;
1385 if(GlobalUtil::_verbose)
1386 fprintf(stdout, "NOTE: changing maximum texture dimension to %d\n", GlobalUtil::_texMaxDimGL);
1390 if(device >0 && device < count)
1392 cudaSetDevice(device);
1393 CheckErrorCUDA("cudaSetDevice\n");
1395 cudaGetDevice(&device_used);
1396 if(device != device_used)
1397 fprintf(stderr, "\nERROR: Cannot set device to %d\n"
1398 "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
1402 ////////////////////////////////////////////////////////////////////////////////////////
1403 // siftmatch funtions
1404 //////////////////////////////////////////////////////////////////////////////////////////
1406 #define MULT_TBLOCK_DIMX 128
1407 #define MULT_TBLOCK_DIMY 1
1408 #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
1409 #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)
1411 void __global__ MultiplyDescriptor_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, int* d_result, int num1, int num2, int3* d_temp)
1413 int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
1415 int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
1416 __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1417 int read_idx1 = idx01 * 8 + threadIdx.x, read_idx2 = idx2 * 8;
1418 int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1419 int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1421 ///////////////////////////////////////////////////////////////
1422 //Load feature descriptors
1423 ///////////////////////////////////////////////////////////////
1424 #if MULT_BLOCK_DIMY == 16
1425 uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1426 data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
1427 data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
1428 #elif MULT_BLOCK_DIMY == 8
1429 if(threadIdx.x < 64)
1431 uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1432 data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
1433 data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
1441 if(idx2 >= num2) return;
1442 ///////////////////////////////////////////////////////////////////////////
1443 //compare descriptors
1445 int results[MULT_BLOCK_DIMY];
1447 for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;
1450 for(int i = 0; i < 8; ++i)
1452 uint4 v = tex1Dfetch<uint4>(texDes2, read_idx2 + i);
1453 unsigned char* p2 = (unsigned char*)(&v);
1455 for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1457 unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
1458 results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
1459 + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
1460 + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
1461 + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
1462 + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
1463 + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
1464 + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
1465 + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
1469 int dst_idx = IMUL(idx1, num2) + idx2;
1472 int3 cmp_result = make_int3(0, -1, 0);
1475 for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1479 cmp_result = results[i] > cmp_result.x?
1480 make_int3(results[i], idx1 + i, cmp_result.x) :
1481 make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1482 d_result[dst_idx + IMUL(i, num2)] = results[i];
1485 d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1489 for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1491 if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
1498 void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
1500 int num1 = des1->GetImgWidth() / 8;
1501 int num2 = des2->GetImgWidth() / 8;
1502 dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1503 (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1504 dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1505 texDot->InitTexture( num2,num1);
1506 if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
1507 CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1508 CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1510 MultiplyDescriptor_Kernel<<<grid, block>>>(des1Tex.handle, des2Tex.handle, (int*)texDot->_cuData, num1, num2,
1511 (texCRT? (int3*)texCRT->_cuData : NULL));
1521 void __global__ MultiplyDescriptorG_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2,
1522 cudaTextureObject_t texLoc1, cudaTextureObject_t texLoc2,
1523 int* d_result, int num1, int num2, int3* d_temp,
1524 Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
1526 int idx01 = (blockIdx.y * MULT_BLOCK_DIMY);
1527 int idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
1529 int idx1 = idx01 + threadIdx.y;
1530 int idx2 = idx02 + threadIdx.x;
1531 __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1532 __shared__ float loc1[MULT_BLOCK_DIMY * 2];
1533 int read_idx1 = idx01 * 8 + threadIdx.x ;
1534 int read_idx2 = idx2 * 8;
1535 int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1536 int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1537 #if MULT_BLOCK_DIMY == 16
1538 uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1539 data1[cache_idx1] = v.x;
1540 data1[cache_idx1+1] = v.y;
1541 data1[cache_idx1+2] = v.z;
1542 data1[cache_idx1+3] = v.w;
1543 #elif MULT_BLOCK_DIMY == 8
1544 if(threadIdx.x < 64)
1546 uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1547 data1[cache_idx1] = v.x;
1548 data1[cache_idx1+1] = v.y;
1549 data1[cache_idx1+2] = v.z;
1550 data1[cache_idx1+3] = v.w;
1556 if(threadIdx.x < MULT_BLOCK_DIMY * 2)
1558 loc1[threadIdx.x] = tex1Dfetch<float>(texLoc1, 2 * idx01 + threadIdx.x);
1561 if(idx2 >= num2) return;
1562 int results[MULT_BLOCK_DIMY];
1563 /////////////////////////////////////////////////////////////////////////////////////////////
1564 //geometric verification
1565 /////////////////////////////////////////////////////////////////////////////////////////////
1567 float2 loc2 = tex1Dfetch<float2>(texLoc2, idx2);
1569 for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1574 float* loci = loc1 + i * 2;
1575 float locx = loci[0], locy = loci[1];
1577 float x[3], diff[2];
1578 x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
1579 x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
1580 x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
1581 diff[0] = FDIV(x[0], x[2]) - loc2.x;
1582 diff[1] = FDIV(x[1], x[2]) - loc2.y;
1583 float hdist = diff[0] * diff[0] + diff[1] * diff[1];
1584 if(hdist < hdistmax)
1586 //check fundamental matrix
1587 float fx1[3], ftx2[3], x2fx1, se;
1588 fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
1589 fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
1590 fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
1592 ftx2[0] = F.mat[0][0] * loc2.x + F.mat[1][0] * loc2.y + F.mat[2][0];
1593 ftx2[1] = F.mat[0][1] * loc2.x + F.mat[1][1] * loc2.y + F.mat[2][1];
1594 //ftx2[2] = F.mat[0][2] * loc2.x + F.mat[1][2] * loc2.y + F.mat[2][2];
1596 x2fx1 = loc2.x * fx1[0] + loc2.y * fx1[1] + fx1[2];
1597 se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
1598 results[i] = se < fdistmax? 0: -262144;
1601 results[i] = -262144;
1605 results[i] = -262144;
1607 good_count += (results[i] >=0);
1609 /////////////////////////////////////////////////////////////////////////////////////////////
1610 ///compare feature descriptors anyway
1611 /////////////////////////////////////////////////////////////////////////////////////////////
1615 for(int i = 0; i < 8; ++i)
1617 uint4 v = tex1Dfetch<uint4>(texDes2, read_idx2 + i);
1618 unsigned char* p2 = (unsigned char*)(&v);
1620 for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1622 unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
1623 results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
1624 + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
1625 + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
1626 + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
1627 + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
1628 + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
1629 + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
1630 + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
1634 int dst_idx = IMUL(idx1, num2) + idx2;
1637 int3 cmp_result = make_int3(0, -1, 0);
1639 for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
1643 cmp_result = results[i] > cmp_result.x?
1644 make_int3(results[i], idx1 + i, cmp_result.x) :
1645 make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1646 d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1652 d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1656 for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1658 if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1666 void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
1667 CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
1668 float* H, float hdistmax, float* F, float fdistmax)
1670 int num1 = des1->GetImgWidth() / 8;
1671 int num2 = des2->GetImgWidth() / 8;
1672 Matrix33 MatF, MatH;
1674 memcpy(MatF.mat, F, 9 * sizeof(float));
1675 memcpy(MatH.mat, H, 9 * sizeof(float));
1677 dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1678 (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1679 dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1680 //intermediate results
1681 texDot->InitTexture( num2,num1);
1682 if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
1683 CuTexImage::CuTexObj loc1Tex = loc1->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1684 CuTexImage::CuTexObj loc2Tex = loc2->BindTexture(texDataDesc, cudaCreateChannelDesc<float2>());
1685 CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1686 CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1687 MultiplyDescriptorG_Kernel<<<grid, block>>>(des1Tex.handle, des2Tex.handle, loc1Tex.handle, loc2Tex.handle,
1688 (int*)texDot->_cuData, num1, num2,
1689 (texCRT? (int3*)texCRT->_cuData : NULL),
1690 MatH, hdistmax, MatF, fdistmax);
1693 #define ROWMATCH_BLOCK_WIDTH 32
1694 #define ROWMATCH_BLOCK_HEIGHT 1
1696 void __global__ RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
1698 #if ROWMATCH_BLOCK_HEIGHT == 1
1699 __shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
1700 __shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
1701 __shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
1702 int row = blockIdx.y;
1704 __shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1705 __shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1706 __shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1707 int* dotmax = x_dotmax[threadIdx.y];
1708 int* dotnxt = x_dotnxt[threadIdx.y];
1709 int* dotidx = x_dotidx[threadIdx.y];
1710 int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
1713 int base_address = IMUL(row , num2);
1714 int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
1715 for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
1717 if(threadIdx.x + i < num2)
1719 int v = d_dot[base_address + threadIdx.x + i]; // tex1Dfetch(texDOT, base_address + threadIdx.x + i);
1720 bool test = v > t_dotmax;
1721 t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
1722 t_dotidx = test? (threadIdx.x + i) : t_dotidx;
1723 t_dotmax = test? v: t_dotmax;
1727 dotmax[threadIdx.x] = t_dotmax;
1728 dotnxt[threadIdx.x] = t_dotnxt;
1729 dotidx[threadIdx.x] = t_dotidx;
1733 for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
1735 if(threadIdx.x < step)
1737 int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
1738 bool test = v2 > v1;
1739 dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
1740 dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
1741 dotmax[threadIdx.x] = test? v2 : v1;
1745 if(threadIdx.x == 0)
1747 float dist = acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
1748 float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
1749 //float ratio = dist / distn;
1750 d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//? : -1;
1756 void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
1758 int num1 = texDot->GetImgHeight();
1759 int num2 = texDot->GetImgWidth();
1760 dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
1761 dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
1762 RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
1763 (int*)texMatch->_cuData, num2, distmax, ratiomax);
1766 #define COLMATCH_BLOCK_WIDTH 32
1768 void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
1770 int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
1771 if(col >= num2) return;
1772 int3 result = d_crt[col];//tex1Dfetch(texCT, col);
1773 int read_idx = col + num2;
1774 for(int i = 1; i < height; ++i, read_idx += num2)
1776 int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
1777 result = result.x < temp.x?
1778 make_int3(temp.x, temp.y, max(result.x, temp.z)) :
1779 make_int3(result.x, result.y, max(result.z, temp.x));
1782 float dist = acos(min(result.x * 0.000003814697265625f, 1.0));
1783 float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
1784 //float ratio = dist / distn;
1785 d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//? : -1;
1789 void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
1791 int height = texCRT->GetImgHeight();
1792 int num2 = texCRT->GetImgWidth();
1793 //texCRT->BindTexture(texCT);
1794 dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
1795 dim3 block(COLMATCH_BLOCK_WIDTH);
1796 ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);