ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ProgramCU.cu
Go to the documentation of this file.
1 ////////////////////////////////////////////////////////////////////////////
2 // File: ProgramCU.cu
3 // Author: Changchang Wu
4 // Description : implementation of ProgramCU and all CUDA kernels
5 //
6 // Copyright (c) 2007 University of North Carolina at Chapel Hill
7 // All Rights Reserved
8 //
9 // Permission to use, copy, modify and distribute this software and its
10 // documentation for educational, research and non-profit purposes, without
11 // fee, and without a written agreement is hereby granted, provided that the
12 // above copyright notice and the following paragraph appear in all copies.
13 //
14 // The University of North Carolina at Chapel Hill make no representations
15 // about the suitability of this software for any purpose. It is provided
16 // 'as is' without express or implied warranty.
17 //
18 // Please send BUG REPORTS to ccwu@cs.unc.edu
19 //
20 ////////////////////////////////////////////////////////////////////////////
21 
22 #if defined(SIFTGPU_CUDA_ENABLED)
23 
24 #include "GL/glew.h"
25 #include "stdio.h"
26 
27 #include "CuTexImage.h"
28 #include "ProgramCU.h"
29 #include "GlobalUtil.h"
30 
31 //----------------------------------------------------------------
32 //Begin SiftGPU setting section.
33 //////////////////////////////////////////////////////////
34 #define IMUL(X,Y) __mul24(X,Y)
35 //#define FDIV(X,Y) ((X)/(Y))
36 #define FDIV(X,Y) __fdividef(X,Y)
37 
38 /////////////////////////////////////////////////////////
39 //filter kernel width range (don't change this)
40 #define KERNEL_MAX_WIDTH 33
41 #define KERNEL_MIN_WIDTH 5
42 
43 //////////////////////////////////////////////////////////
44 //horizontal filter block size (32, 64, 128, 256, 512)
45 #define FILTERH_TILE_WIDTH 128
46 //thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
47 #define FILTERV_BLOCK_WIDTH 16
48 #define FILTERV_BLOCK_HEIGHT 32
49 //The corresponding image patch for a thread block
50 #define FILTERV_PIXEL_PER_THREAD 4
51 #define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
52 #define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
53 
54 
55 //////////////////////////////////////////////////////////
56 //thread block size for computing Difference of Gaussian
57 #define DOG_BLOCK_LOG_DIMX 7
58 #define DOG_BLOCK_LOG_DIMY 0
59 #define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX)
60 #define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY)
61 
62 //////////////////////////////////////////////////////////
63 //thread block size for keypoint detection
64 #define KEY_BLOCK_LOG_DIMX 3
65 #define KEY_BLOCK_LOG_DIMY 3
66 #define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
67 #define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)
68 //#define KEY_OFFSET_ONE
69 //make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
70 //but it seems uncoalesced writes don't affect the speed
71 
72 //////////////////////////////////////////////////////////
73 //thread block size for initializing list generation (64, 128, 256, 512 ...)
74 #define HIST_INIT_WIDTH 128
75 //thread block size for generating feature list (32, 64, 128, 256, 512, ...)
76 #define LISTGEN_BLOCK_DIM 128
77 
78 
79 /////////////////////////////////////////////////////////
80 //how many keypoint orientations to compute in a block
81 #define ORIENTATION_COMPUTE_PER_BLOCK 64
82 //how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
83 #define DESCRIPTOR_COMPUTE_PER_BLOCK 4
84 #define DESCRIPTOR_COMPUTE_BLOCK_SIZE (16 * DESCRIPTOR_COMPUTE_PER_BLOCK)
85 //how many keypoint descriptor to normalized in a block (32, ...)
86 #define DESCRIPTOR_NORMALIZ_PER_BLOCK 32
87 
88 
89 
90 ///////////////////////////////////////////
91 //Thread block size for visualization
92 //(This doesn't affect the speed of computation)
93 #define BLOCK_LOG_DIM 4
94 #define BLOCK_DIM (1 << BLOCK_LOG_DIM)
95 
96 //End SiftGPU setting section.
97 //----------------------------------------------------------------
98 
99 
100 __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
101 
102 const static cudaTextureDesc texDataDesc = []() {
103  cudaTextureDesc textureDesc;
104  memset(&textureDesc, 0, sizeof(textureDesc));
105  textureDesc.readMode = cudaReadModeElementType;
106  textureDesc.addressMode[0] = cudaAddressModeClamp;
107  textureDesc.addressMode[1] = cudaAddressModeClamp;
108  textureDesc.addressMode[2] = cudaAddressModeClamp;
109  textureDesc.filterMode = cudaFilterModePoint;
110  textureDesc.normalizedCoords = false;
111  return textureDesc;
112 }();
113 
114 const static cudaTextureDesc texDataBDesc = []() {
115  cudaTextureDesc textureDesc;
116  memset(&textureDesc, 0, sizeof(textureDesc));
117  textureDesc.readMode = cudaReadModeNormalizedFloat;
118  textureDesc.addressMode[0] = cudaAddressModeClamp;
119  textureDesc.addressMode[1] = cudaAddressModeClamp;
120  textureDesc.addressMode[2] = cudaAddressModeClamp;
121  textureDesc.filterMode = cudaFilterModePoint;
122  textureDesc.normalizedCoords = false;
123  return textureDesc;
124 }();
125 
126 //////////////////////////////////////////////////////////////
127 template<int FW> __global__ void FilterH(cudaTextureObject_t texData, float* d_result, int width)
128 {
129 
130  const int HALF_WIDTH = FW >> 1;
131  const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;
132  const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;
133  __shared__ float data[CACHE_WIDTH];
134  const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
135  const int col = bcol + threadIdx.x;
136  const int index_min = IMUL(blockIdx.y, width);
137  const int index_max = index_min + width - 1;
138  int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
139  int cache_index = threadIdx.x;
140  float value = 0;
141 #pragma unroll
142  for(int j = 0; j < CACHE_COUNT; ++j)
143  {
144  if(cache_index < CACHE_WIDTH)
145  {
146  int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
147  data[cache_index] = tex1Dfetch<float>(texData,fetch_index);
148  src_index += FILTERH_TILE_WIDTH;
149  cache_index += FILTERH_TILE_WIDTH;
150  }
151  }
152  __syncthreads();
153  if(col >= width) return;
154 #pragma unroll
155  for(int i = 0; i < FW; ++i)
156  {
157  value += (data[threadIdx.x + i]* d_kernel[i]);
158  }
159 // value = Conv<FW-1>(data + threadIdx.x);
160  d_result[index_min + col] = value;
161 }
162 
163 
164 
165 ////////////////////////////////////////////////////////////////////
166 template<int FW> __global__ void FilterV(cudaTextureObject_t texData, float* d_result, int width, int height)
167 {
168  const int HALF_WIDTH = FW >> 1;
169  const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
170  const int TEMP = CACHE_WIDTH & 0xf;
171 //add some extra space to avoid bank conflict
172 #if FILTERV_TILE_WIDTH == 16
173  //make the stride 16 * n +/- 1
174  const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
175 #elif FILTERV_TILE_WIDTH == 8
176  //make the stride 16 * n +/- 2
177  const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
178 #elif FILTERV_TILE_WIDTH == 4
179  //make the stride 16 * n +/- 4
180  const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
181 #else
182 #error
183 #endif
184  const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
185  const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT;
186  const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT;
187  __shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
188  const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
189  const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
190  const int row_first = row_block_first - HALF_WIDTH;
191  const int data_index_max = IMUL(height - 1, width) + col;
192  const int cache_col_start = threadIdx.y;
193  const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
194  int cache_index = cache_col_start + cache_row_start;
195  int data_index = IMUL(row_first + cache_col_start, width) + col;
196 
197  if(col < width)
198  {
199 #pragma unroll
200  for(int i = 0; i < CACHE_COUNT; ++i)
201  {
202  if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
203  {
204  int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
205  data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch<float>(texData,fetch_index);
206  data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);
207  }
208  }
209  }
210  __syncthreads();
211 
212  if(col >= width) return;
213 
214  int row = row_block_first + threadIdx.y;
215  int index_start = cache_row_start + threadIdx.y;
216 #pragma unroll
217  for(int i = 0; i < WRITE_COUNT; ++i,
218  row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT)
219  {
220  if(row < height)
221  {
222  int index_dest = IMUL(row, width) + col;
223  float value = 0;
224 #pragma unroll
225  for(int i = 0; i < FW; ++i)
226  {
227  value += (data[index_start + i] * d_kernel[i]);
228  }
229  d_result[index_dest] = value;
230  }
231  }
232 }
233 
234 
235 template<int LOG_SCALE> __global__ void UpsampleKernel(cudaTextureObject_t texData, float* d_result, int width)
236 {
237  const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
238  const float INV_SCALE = 1.0f / (float(SCALE));
239  int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
240  if(col >= width) return;
241 
242  int row = blockIdx.y >> LOG_SCALE;
243  int index = row * width + col;
244  int dst_row = blockIdx.y;
245  int dst_idx= (width * dst_row + col) * SCALE;
246  int helper = blockIdx.y & SCALE_MASK;
247  if (helper)
248  {
249  float v11 = tex1Dfetch<float>(texData, index);
250  float v12 = tex1Dfetch<float>(texData, index + 1);
251  index += width;
252  float v21 = tex1Dfetch<float>(texData, index);
253  float v22 = tex1Dfetch<float>(texData, index + 1);
254  float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
255  float v1 = (v21 * w1 + w2 * v11);
256  float v2 = (v22 * w1 + w2 * v12);
257  d_result[dst_idx] = v1;
258 #pragma unroll
259  for(int i = 1; i < SCALE; ++i)
260  {
261  const float r2 = i * INV_SCALE;
262  const float r1 = 1.0f - r2;
263  d_result[dst_idx +i] = v1 * r1 + v2 * r2;
264  }
265  }else
266  {
267  float v1 = tex1Dfetch<float>(texData, index);
268  float v2 = tex1Dfetch<float>(texData, index + 1);
269  d_result[dst_idx] = v1;
270 #pragma unroll
271  for(int i = 1; i < SCALE; ++i)
272  {
273  const float r2 = i * INV_SCALE;
274  const float r1 = 1.0f - r2;
275  d_result[dst_idx +i] = v1 * r1 + v2 * r2;
276  }
277  }
278 
279 }
280 
281 ////////////////////////////////////////////////////////////////////////////////////////
282 void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
283 {
284  int width = src->GetImgWidth(), height = src->GetImgHeight();
285  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
286  dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
287  dim3 block(FILTERH_TILE_WIDTH);
288  switch(log_scale)
289  {
290  case 1 : UpsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
291  case 2 : UpsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
292  case 3 : UpsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
293  default: break;
294  }
295 }
296 
297 template<int LOG_SCALE> __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width)
298 {
299  const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
300  if(dst_col >= dst_width) return;
301  const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));
302  const int dst_row = blockIdx.y;
303  const int src_row = blockIdx.y << LOG_SCALE;
304  const int src_idx = IMUL(src_row, src_width) + src_col;
305  const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
306  d_result[dst_idx] = tex1Dfetch<float>(texData, src_idx);
307 
308 }
309 
310 __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width, const int log_scale)
311 {
312  const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
313  if(dst_col >= dst_width) return;
314  const int src_col = min((dst_col << log_scale), (src_width - 1));
315  const int dst_row = blockIdx.y;
316  const int src_row = blockIdx.y << log_scale;
317  const int src_idx = IMUL(src_row, src_width) + src_col;
318  const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
319  d_result[dst_idx] = tex1Dfetch<float>(texData, src_idx);
320 
321 }
322 
323 void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
324 {
325  int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
326 
327  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
328  dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
329  dim3 block(FILTERH_TILE_WIDTH);
330  switch(log_scale)
331  {
332  case 1 : DownsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
333  case 2 : DownsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
334  case 3 : DownsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
335  default: DownsampleKernel <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width, log_scale);
336  }
337 }
338 
339 __global__ void ChannelReduce_Kernel(cudaTextureObject_t texData, float* d_result)
340 {
341  int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
342  d_result[index] = tex1Dfetch<float>(texData, index*4);
343 }
344 
345 __global__ void ChannelReduce_Convert_Kernel(cudaTextureObject_t texDataF4, float* d_result)
346 {
347  int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
348  float4 rgba = tex1Dfetch<float4>(texDataF4, index);
349  d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
350 }
351 
352 void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
353 {
354  int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
355 
356  dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
357  dim3 block(FILTERH_TILE_WIDTH);
358  if(convert_rgb)
359  {
360  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
361  ChannelReduce_Convert_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
362  }else
363  {
364  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
365  ChannelReduce_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
366  }
367 }
368 
369 __global__ void ConvertByteToFloat_Kernel(cudaTextureObject_t texDataB, float* d_result)
370 {
371  int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
372  d_result[index] = tex1Dfetch<float>(texDataB, index);
373 }
374 
375 void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
376 {
377  int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
378  dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
379  dim3 block(FILTERH_TILE_WIDTH);
380  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataBDesc, cudaCreateChannelDesc<float>());
381  ConvertByteToFloat_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
382 }
383 
384 void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
385 {
386  int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
387  width = 2*sz + 1;
388 
389  if(width > KERNEL_MAX_WIDTH)
390  {
391  //filter size truncation
392  sz = KERNEL_MAX_WIDTH >> 1;
393  width =KERNEL_MAX_WIDTH;
394  }else if(width < KERNEL_MIN_WIDTH)
395  {
396  sz = KERNEL_MIN_WIDTH >> 1;
397  width =KERNEL_MIN_WIDTH;
398  }
399 
400  float rv = 1.0f/(sigma*sigma), v, ksum =0;
401 
402  // pre-compute filter
403  for( i = -sz ; i <= sz ; ++i)
404  {
405  kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
406  ksum += v;
407  }
408 
409  //normalize the kernel
410  rv = 1.0f/ksum;
411  for(i = 0; i< width ;i++) kernel[i]*=rv;
412 }
413 
414 
415 template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
416 {
417  int width = src->GetImgWidth(), height = src->GetImgHeight();
418 
419  //horizontal filtering
420  CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
421  dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
422  dim3 blockh(FILTERH_TILE_WIDTH);
423  FilterH<FW><<<gridh, blockh>>>(srcTex.handle, (float*)buf->_cuData, width);
424  CheckErrorCUDA("FilterH");
425 
426  ///vertical filtering
427  CuTexImage::CuTexObj bufTex = buf->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
428  dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
429  dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);
430  FilterV<FW><<<gridv, blockv>>>(bufTex.handle, (float*)dst->_cuData, width, height);
431  CheckErrorCUDA("FilterV");
432 }
433 
434 //////////////////////////////////////////////////////////////////////
435 // tested on 2048x1500 image, the time on pyramid construction is
436 // OpenGL version : 18ms
437 // CUDA version: 28 ms
438 void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
439 {
440  float filter_kernel[KERNEL_MAX_WIDTH]; int width;
441  CreateFilterKernel(sigma, filter_kernel, width);
442  cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
443 
444  switch(width)
445  {
446  case 5: FilterImage< 5>(dst, src, buf); break;
447  case 7: FilterImage< 7>(dst, src, buf); break;
448  case 9: FilterImage< 9>(dst, src, buf); break;
449  case 11: FilterImage<11>(dst, src, buf); break;
450  case 13: FilterImage<13>(dst, src, buf); break;
451  case 15: FilterImage<15>(dst, src, buf); break;
452  case 17: FilterImage<17>(dst, src, buf); break;
453  case 19: FilterImage<19>(dst, src, buf); break;
454  case 21: FilterImage<21>(dst, src, buf); break;
455  case 23: FilterImage<23>(dst, src, buf); break;
456  case 25: FilterImage<25>(dst, src, buf); break;
457  case 27: FilterImage<27>(dst, src, buf); break;
458  case 29: FilterImage<29>(dst, src, buf); break;
459  case 31: FilterImage<31>(dst, src, buf); break;
460  case 33: FilterImage<33>(dst, src, buf); break;
461  default: break;
462  }
463 
464 }
465 
466 
467 void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, float2* d_got, int width, int height)
468 {
469  int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
470  int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
471  if(col < width && row < height)
472  {
473  int index = IMUL(row, width) + col;
474  float vp = tex1Dfetch<float>(texP, index);
475  float v = tex1Dfetch<float>(texC, index);
476  d_dog[index] = v - vp;
477  float vxn = tex1Dfetch<float>(texC, index + 1);
478  float vxp = tex1Dfetch<float>(texC, index - 1);
479  float vyp = tex1Dfetch<float>(texC, index - width);
480  float vyn = tex1Dfetch<float>(texC, index + width);
481  float dx = vxn - vxp, dy = vyn - vyp;
482  float grd = 0.5f * sqrt(dx * dx + dy * dy);
483  float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
484  d_got[index] = make_float2(grd, rot);
485  }
486 }
487 
488 void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, int width, int height)
489 {
490  int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
491  int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
492  if(col < width && row < height)
493  {
494  int index = IMUL(row, width) + col;
495  float vp = tex1Dfetch<float>(texP, index);
496  float v = tex1Dfetch<float>(texC, index);
497  d_dog[index] = v - vp;
498  }
499 }
500 
501 void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
502 {
503  int width = gus->GetImgWidth(), height = gus->GetImgHeight();
504  dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
505  dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
506  CuTexImage::CuTexObj texCObj = gus->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
507  CuTexImage::CuTexObj texPObj = (gus-1)->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
508  if(got->_cuData)
509  ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, (float2*) got->_cuData, width, height);
510  else
511  ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, width, height);
512 }
513 
514 
515 #define READ_CMP_DOG_DATA(datai, tex, idx) \
516  datai[0] = tex1Dfetch<float>(tex, idx - 1);\
517  datai[1] = tex1Dfetch<float>(tex, idx);\
518  datai[2] = tex1Dfetch<float>(tex, idx + 1);\
519  if(v > nmax)\
520  {\
521  nmax = max(nmax, datai[0]);\
522  nmax = max(nmax, datai[1]);\
523  nmax = max(nmax, datai[2]);\
524  if(v < nmax) goto key_finish;\
525  }else\
526  {\
527  nmin = min(nmin, datai[0]);\
528  nmin = min(nmin, datai[1]);\
529  nmin = min(nmin, datai[2]);\
530  if(v > nmin) goto key_finish;\
531  }
532 
533 
534 void __global__ ComputeKEY_Kernel(cudaTextureObject_t texP, cudaTextureObject_t texC, cudaTextureObject_t texN, float4* d_key, int width, int colmax, int rowmax,
535  float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization)
536 {
537  float data[3][3], v;
538  float datap[3][3], datan[3][3];
539 #ifdef KEY_OFFSET_ONE
540  int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
541  int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
542 #else
543  int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
544  int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
545 #endif
546  int index = IMUL(row, width) + col;
547  int idx[3] ={index - width, index, index + width};
548  int in_image =0;
549  float nmax, nmin, result = 0.0f;
550  float dx = 0, dy = 0, ds = 0;
551  bool offset_test_passed = true;
552 #ifdef KEY_OFFSET_ONE
553  if(row < rowmax && col < colmax)
554 #else
555  if(row > 0 && col > 0 && row < rowmax && col < colmax)
556 #endif
557  {
558  in_image = 1;
559  data[1][1] = v = tex1Dfetch<float>(texC, idx[1]);
560  if(fabs(v) <= dog_threshold0) goto key_finish;
561 
562  data[1][0] = tex1Dfetch<float>(texC, idx[1] - 1);
563  data[1][2] = tex1Dfetch<float>(texC, idx[1] + 1);
564  nmax = max(data[1][0], data[1][2]);
565  nmin = min(data[1][0], data[1][2]);
566 
567  if(v <=nmax && v >= nmin) goto key_finish;
568  //if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
569  READ_CMP_DOG_DATA(data[0], texC, idx[0]);
570  READ_CMP_DOG_DATA(data[2], texC, idx[2]);
571 
572  //edge supression
573  float vx2 = v * 2.0f;
574  float fxx = data[1][0] + data[1][2] - vx2;
575  float fyy = data[0][1] + data[2][1] - vx2;
576  float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
577  float temp1 = fxx * fyy - fxy * fxy;
578  float temp2 = (fxx + fyy) * (fxx + fyy);
579  if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;
580 
581 
582  //read the previous level
583  READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
584  READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
585  READ_CMP_DOG_DATA(datap[2], texP, idx[2]);
586 
587 
588  //read the next level
589  READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
590  READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
591  READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
592 
593  if(subpixel_localization)
594  {
595  //subpixel localization
596  float fx = 0.5f * (data[1][2] - data[1][0]);
597  float fy = 0.5f * (data[2][1] - data[0][1]);
598  float fs = 0.5f * (datan[1][1] - datap[1][1]);
599 
600  float fss = (datan[1][1] + datap[1][1] - vx2);
601  float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
602  float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);
603 
604  //need to solve dx, dy, ds;
605  // |-fx| | fxx fxy fxs | |dx|
606  // |-fy| = | fxy fyy fys | * |dy|
607  // |-fs| | fxs fys fss | |ds|
608  float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
609  float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
610  float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
611  float maxa = max(max(A0.x, A1.x), A2.x);
612  if(maxa >= 1e-10)
613  {
614  if(maxa == A1.x)
615  {
616  float4 TEMP = A1; A1 = A0; A0 = TEMP;
617  }else if(maxa == A2.x)
618  {
619  float4 TEMP = A2; A2 = A0; A0 = TEMP;
620  }
621  A0.y /= A0.x; A0.z /= A0.x; A0.w/= A0.x;
622  A1.y -= A1.x * A0.y; A1.z -= A1.x * A0.z; A1.w -= A1.x * A0.w;
623  A2.y -= A2.x * A0.y; A2.z -= A2.x * A0.z; A2.w -= A2.x * A0.w;
624  if(abs(A2.y) > abs(A1.y))
625  {
626  float4 TEMP = A2; A2 = A1; A1 = TEMP;
627  }
628  if(abs(A1.y) >= 1e-10)
629  {
630  A1.z /= A1.y; A1.w /= A1.y;
631  A2.z -= A2.y * A1.z; A2.w -= A2.y * A1.w;
632  if(abs(A2.z) >= 1e-10)
633  {
634  ds = A2.w / A2.z;
635  dy = A1.w - ds * A1.z;
636  dx = A0.w - ds * A0.z - dy * A0.y;
637 
638  offset_test_passed =
639  fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
640  &&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
641  }
642  }
643  }
644  }
645  if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
646  }
647 key_finish:
648  if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
649 }
650 
651 
652 void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
653 {
654  int width = dog->GetImgWidth(), height = dog->GetImgHeight();
655  float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
656  CuTexImage* dogp = dog - 1;
657  CuTexImage* dogn = dog + 1;
658 #ifdef KEY_OFFSET_ONE
659  dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
660 #else
661  dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
662 #endif
663  dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
664  CuTexImage::CuTexObj texPObj = dogp->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
665  CuTexImage::CuTexObj texCObj = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
666  CuTexImage::CuTexObj texNObj = dogn->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
667  Tedge = (Tedge+1)*(Tedge+1)/Tedge;
668  ComputeKEY_Kernel<<<grid, block>>>(texPObj.handle, texCObj.handle, texNObj.handle, (float4*) key->_cuData, width,
669  width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
670 
671 }
672 
673 
674 
675 void __global__ InitHist_Kernel(cudaTextureObject_t texDataF4, int4* hist, int ws, int wd, int height)
676 {
677  int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
678  int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
679  if(row < height && col < wd)
680  {
681  int hidx = IMUL(row, wd) + col;
682  int scol = col << 2;
683  int sidx = IMUL(row, ws) + scol;
684  int v[4] = {0, 0, 0, 0};
685  if(row > 0 && row < height -1)
686  {
687 #pragma unroll
688  for(int i = 0; i < 4 ; ++i, ++scol)
689  {
690  float4 temp = tex1Dfetch<float4>(texDataF4, sidx +i);
691  v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
692  }
693  }
694  hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
695 
696  }
697 }
698 
699 
700 
701 void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
702 {
703  int ws = key->GetImgWidth(), hs = key->GetImgHeight();
704  int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
705  dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
706  dim3 block(HIST_INIT_WIDTH, 1);
707  CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
708  InitHist_Kernel<<<grid, block>>>(keyTex.handle, (int4*) hist->_cuData, ws, wd, hd);
709 }
710 
711 
712 
713 void __global__ ReduceHist_Kernel(cudaTextureObject_t texDataI4, int4* d_hist, int ws, int wd, int height)
714 {
715  int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
716  int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
717  if(row < height && col < wd)
718  {
719  int hidx = IMUL(row, wd) + col;
720  int scol = col << 2;
721  int sidx = IMUL(row, ws) + scol;
722  int v[4] = {0, 0, 0, 0};
723 #pragma unroll
724  for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
725  {
726  int4 temp = tex1Dfetch<int4>(texDataI4, sidx + i);
727  v[i] = temp.x + temp.y + temp.z + temp.w;
728  }
729  d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
730  }
731 }
732 
733 void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
734 {
735  int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
736  int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
737  int temp = (int)floorf(logf(float(wd * 2/ 3)) / logf(2.0f));
738  const int wi = min(7, max(temp , 0));
739  CuTexImage::CuTexObj hist1Tex = hist1->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
740 
741  const int BW = 1 << wi, BH = 1 << (7 - wi);
742  dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH);
743  dim3 block(BW, BH);
744  ReduceHist_Kernel<<<grid, block>>>(hist1Tex.handle, (int4*)hist2->_cuData, ws, wd, hd);
745 }
746 
747 
748 void __global__ ListGen_Kernel(cudaTextureObject_t texDataList, cudaTextureObject_t texDataI4, int4* d_list, int list_len, int width)
749 {
750  int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
751  int4 pos = tex1Dfetch<int4>(texDataList, idx1);
752  int idx2 = IMUL(pos.y, width) + pos.x;
753  int4 temp = tex1Dfetch<int4>(texDataI4, idx2);
754  int sum1 = temp.x + temp.y;
755  int sum2 = sum1 + temp.z;
756  pos.x <<= 2;
757  if(pos.z >= sum2)
758  {
759  pos.x += 3;
760  pos.z -= sum2;
761  }else if(pos.z >= sum1)
762  {
763  pos.x += 2;
764  pos.z -= sum1;
765  }else if(pos.z >= temp.x)
766  {
767  pos.x += 1;
768  pos.z -= temp.x;
769  }
770  if (idx1 < list_len) {
771  d_list[idx1] = pos;
772  }
773 }
774 
775 //input list (x, y) (x, y) ....
776 void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
777 {
778  int len = list->GetImgWidth();
779  CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
780  CuTexImage::CuTexObj histTex = hist->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
781  dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
782  dim3 block(LISTGEN_BLOCK_DIM);
783  ListGen_Kernel<<<grid, block>>>(listTex.handle, histTex.handle, (int4*) list->_cuData, len,
784  hist->GetImgWidth());
785 }
786 
787 void __global__ ComputeOrientation_Kernel(cudaTextureObject_t texDataF2,
788  cudaTextureObject_t texDataF4,
789  cudaTextureObject_t texDataList,
790  float4* d_list,
791  int list_len,
792  int width, int height,
793  float sigma, float sigma_step,
794  float gaussian_factor, float sample_factor,
795  int num_orientation,
796  int existing_keypoint,
797  int subpixel,
798  int keepsign)
799 {
800  const float ten_degree_per_radius = 5.7295779513082320876798154814105;
801  const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
802  int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
803  if(idx >= list_len) return;
804  float4 key;
805  if(existing_keypoint)
806  {
807  key = tex1Dfetch<float4>(texDataF4, idx);
808  }else
809  {
810  int4 ikey = tex1Dfetch<int4>(texDataList, idx);
811  key.x = ikey.x + 0.5f;
812  key.y = ikey.y + 0.5f;
813  key.z = sigma;
814  if(subpixel || keepsign)
815  {
816  float4 offset = tex1Dfetch<float4>(texDataF4, IMUL(width, ikey.y) + ikey.x);
817  if(subpixel)
818  {
819  key.x += offset.y;
820  key.y += offset.z;
821  key.z *= pow(sigma_step, offset.w);
822  }
823  if(keepsign) key.z *= offset.x;
824  }
825  }
826  if(num_orientation == 0)
827  {
828  key.w = 0;
829  d_list[idx] = key;
830  return;
831  }
832  float vote[37];
833  float gsigma = key.z * gaussian_factor;
834  float win = fabs(key.z) * sample_factor;
835  float dist_threshold = win * win + 0.5;
836  float factor = -0.5f / (gsigma * gsigma);
837  float xmin = max(1.5f, floorf(key.x - win) + 0.5f);
838  float ymin = max(1.5f, floorf(key.y - win) + 0.5f);
839  float xmax = min(width - 1.5f, floorf(key.x + win) + 0.5f);
840  float ymax = min(height -1.5f, floorf(key.y + win) + 0.5f);
841 #pragma unroll
842  for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
843  for(float y = ymin; y <= ymax; y += 1.0f)
844  {
845  for(float x = xmin; x <= xmax; x += 1.0f)
846  {
847  float dx = x - key.x;
848  float dy = y - key.y;
849  float sq_dist = dx * dx + dy * dy;
850  if(sq_dist >= dist_threshold) continue;
851  float2 got = tex2D<float2>(texDataF2, x, y);
852  float weight = got.x * exp(sq_dist * factor);
853  float fidx = floorf(got.y * ten_degree_per_radius);
854  int oidx = fidx;
855  if(oidx < 0) oidx += 36;
856  vote[oidx] += weight;
857  }
858  }
859 
860  //filter the vote
861 
862  const float one_third = 1.0 /3.0;
863 #pragma unroll
864  for(int i = 0; i < 6; ++i)
865  {
866  vote[36] = vote[0];
867  float pre = vote[35];
868 #pragma unroll
869  for(int j = 0; j < 36; ++j)
870  {
871  float temp = one_third * (pre + vote[j] + vote[j + 1]);
872  pre = vote[j]; vote[j] = temp;
873  }
874  }
875 
876  vote[36] = vote[0];
877  if(num_orientation == 1 || existing_keypoint)
878  {
879  int index_max = 0;
880  float max_vote = vote[0];
881 #pragma unroll
882  for(int i = 1; i < 36; ++i)
883  {
884  index_max = vote[i] > max_vote? i : index_max;
885  max_vote = max(max_vote, vote[i]);
886  }
887  float pre = vote[index_max == 0? 35 : index_max -1];
888  float next = vote[index_max + 1];
889  float weight = max_vote;
890  float off = 0.5f * FDIV(next - pre, weight + weight - next - pre);
891  key.w = radius_per_ten_degrees * (index_max + 0.5f + off);
892  d_list[idx] = key;
893 
894  }else
895  {
896  float max_vote = vote[0];
897 #pragma unroll
898  for(int i = 1; i < 36; ++i) max_vote = max(max_vote, vote[i]);
899 
900  float vote_threshold = max_vote * 0.8f;
901  float pre = vote[35];
902  float max_rot[2], max_vot[2] = {0, 0};
903  int ocount = 0;
904 #pragma unroll
905  for(int i =0; i < 36; ++i)
906  {
907  float next = vote[i + 1];
908  if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
909  {
910  float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre);
911  float rot = i + di + 0.5f;
912  float weight = vote[i];
913  ///
914  if(weight > max_vot[1])
915  {
916  if(weight > max_vot[0])
917  {
918  max_vot[1] = max_vot[0];
919  max_rot[1] = max_rot[0];
920  max_vot[0] = weight;
921  max_rot[0] = rot;
922  }
923  else
924  {
925  max_vot[1] = weight;
926  max_rot[1] = rot;
927  }
928  ocount ++;
929  }
930  }
931  pre = vote[i];
932  }
933  float fr1 = max_rot[0] / 36.0f;
934  if(fr1 < 0) fr1 += 1.0f;
935  unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floorf(fr1 * 65535.0f));
936  unsigned short us2 = 65535;
937  if(ocount > 1)
938  {
939  float fr2 = max_rot[1] / 36.0f;
940  if(fr2 < 0) fr2 += 1.0f;
941  us2 = (unsigned short ) floorf(fr2 * 65535.0f);
942  }
943  unsigned int uspack = (us2 << 16) | us1;
944  key.w = __int_as_float(uspack);
945  d_list[idx] = key;
946  }
947 
948 }
949 
950 
951 
952 
953 void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,
954  float sigma, float sigma_step, int existing_keypoint)
955 {
956  int len = list->GetImgWidth();
957  if(len <= 0) return;
958  int width = got->GetImgWidth(), height = got->GetImgHeight();
959  CuTexImage::CuTexObj texObjF4;
960  CuTexImage::CuTexObj texObjList;
961  if(existing_keypoint)
962  {
963  texObjF4 = list->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
964  }else
965  {
966  texObjList = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
967  if(GlobalUtil::_SubpixelLocalization)
968  {
969  texObjF4 = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
970  }
971  }
972 
973  CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc<float2>());
974 
975  const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK;
976  dim3 grid((len + block_width -1) / block_width);
977  dim3 block(block_width);
978 
979  ComputeOrientation_Kernel<<<grid, block>>>(
980  gotTex.handle,
981  texObjF4.handle,
982  texObjList.handle,
983  (float4*) list->_cuData,
984  len, width, height, sigma, sigma_step,
985  GlobalUtil::_OrientationGaussianFactor,
986  GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor,
987  GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,
988  existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign);
989 
990  ProgramCU::CheckErrorCUDA("ComputeOrientation");
991 }
992 
993 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num,
994  int width, int height, float window_factor)
995 {
996  const float rpi = 4.0/ 3.14159265358979323846;
997  int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
998  int fidx = idx >> 4;
999  if(fidx >= num) return;
1000  float4 key = tex1Dfetch<float4>(texDataF4, fidx);
1001  int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1002  float spt = fabs(key.z * window_factor);
1003  float s, c; __sincosf(key.w, &s, &c);
1004  float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ;
1005  float cspt = c * spt, sspt = s * spt;
1006  float crspt = c / spt, srspt = s / spt;
1007  float2 offsetpt, pt;
1008  float xmin, ymin, xmax, ymax, bsz;
1009  offsetpt.x = ix - 1.5f;
1010  offsetpt.y = iy - 1.5f;
1011  pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x;
1012  pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y;
1013  bsz = fabs(cspt) + fabs(sspt);
1014  xmin = max(1.5f, floorf(pt.x - bsz) + 0.5f);
1015  ymin = max(1.5f, floorf(pt.y - bsz) + 0.5f);
1016  xmax = min(width - 1.5f, floorf(pt.x + bsz) + 0.5f);
1017  ymax = min(height - 1.5f, floorf(pt.y + bsz) + 0.5f);
1018  float des[9];
1019 #pragma unroll
1020  for(int i =0; i < 9; ++i) des[i] = 0.0f;
1021  for(float y = ymin; y <= ymax; y += 1.0f)
1022  {
1023  for(float x = xmin; x <= xmax; x += 1.0f)
1024  {
1025  float dx = x - pt.x;
1026  float dy = y - pt.y;
1027  float nx = crspt * dx + srspt * dy;
1028  float ny = crspt * dy - srspt * dx;
1029  float nxn = fabs(nx);
1030  float nyn = fabs(ny);
1031  if(nxn < 1.0f && nyn < 1.0f)
1032  {
1033  float2 cc = tex2D<float2>(texDataF2, x, y);
1034  float dnx = nx + offsetpt.x;
1035  float dny = ny + offsetpt.y;
1036  float ww = exp(-0.125f * (dnx * dnx + dny * dny));
1037  float wx = 1.0 - nxn;
1038  float wy = 1.0 - nyn;
1039  float weight = ww * wx * wy * cc.x;
1040  float theta = (anglef - cc.y) * rpi;
1041  if(theta < 0) theta += 8.0f;
1042  float fo = floorf(theta);
1043  int fidx = fo;
1044  float weight1 = fo + 1.0f - theta;
1045  float weight2 = theta - fo;
1046  if(DYNAMIC_INDEXING)
1047  {
1048  des[fidx] += (weight1 * weight);
1049  des[fidx + 1] += (weight2 * weight);
1050  //this dynamic indexing part might be slow
1051  }else
1052  {
1053  #pragma unroll
1054  for(int k = 0; k < 8; ++k)
1055  {
1056  if(k == fidx)
1057  {
1058  des[k] += (weight1 * weight);
1059  des[k+1] += (weight2 * weight);
1060  }
1061  }
1062  }
1063  }
1064  }
1065  }
1066  des[0] += des[8];
1067 
1068  int didx = idx << 1;
1069  d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1070  d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1071 }
1072 
1073 
1074 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num,
1075  int width, int height, float window_factor)
1076 {
1077  const float rpi = 4.0/ 3.14159265358979323846;
1078  int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1079  int fidx = idx >> 4;
1080  if(fidx >= num) return;
1081  float4 key = tex1Dfetch<float4>(texDataF4, fidx);
1082  int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1083  //float aspect_ratio = key.w / key.z;
1084  //float aspect_sq = aspect_ratio * aspect_ratio;
1085  float sptx = key.z * 0.25, spty = key.w * 0.25;
1086  float xmin, ymin, xmax, ymax; float2 pt;
1087  pt.x = sptx * (ix + 0.5f) + key.x;
1088  pt.y = spty * (iy + 0.5f) + key.y;
1089  xmin = max(1.5f, floorf(pt.x - sptx) + 0.5f);
1090  ymin = max(1.5f, floorf(pt.y - spty) + 0.5f);
1091  xmax = min(width - 1.5f, floorf(pt.x + sptx) + 0.5f);
1092  ymax = min(height - 1.5f, floorf(pt.y + spty) + 0.5f);
1093  float des[9];
1094 #pragma unroll
1095  for(int i =0; i < 9; ++i) des[i] = 0.0f;
1096  for(float y = ymin; y <= ymax; y += 1.0f)
1097  {
1098  for(float x = xmin; x <= xmax; x += 1.0f)
1099  {
1100  float nx = (x - pt.x) / sptx;
1101  float ny = (y - pt.y) / spty;
1102  float nxn = fabs(nx);
1103  float nyn = fabs(ny);
1104  if(nxn < 1.0f && nyn < 1.0f)
1105  {
1106  float2 cc = tex2D<float2>(texDataF2, x, y);
1107  float wx = 1.0 - nxn;
1108  float wy = 1.0 - nyn;
1109  float weight = wx * wy * cc.x;
1110  float theta = (- cc.y) * rpi;
1111  if(theta < 0) theta += 8.0f;
1112  float fo = floorf(theta);
1113  int fidx = fo;
1114  float weight1 = fo + 1.0f - theta;
1115  float weight2 = theta - fo;
1116  if(DYNAMIC_INDEXING)
1117  {
1118  des[fidx] += (weight1 * weight);
1119  des[fidx + 1] += (weight2 * weight);
1120  //this dynamic indexing part might be slow
1121  }else
1122  {
1123  #pragma unroll
1124  for(int k = 0; k < 8; ++k)
1125  {
1126  if(k == fidx)
1127  {
1128  des[k] += (weight1 * weight);
1129  des[k+1] += (weight2 * weight);
1130  }
1131  }
1132  }
1133  }
1134  }
1135  }
1136  des[0] += des[8];
1137 
1138  int didx = idx << 1;
1139  d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1140  d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1141 }
1142 
1143 void __global__ NormalizeDescriptor_Kernel(cudaTextureObject_t texDataF4, float4* d_des, int num)
1144 {
1145  float4 temp[32];
1146  int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1147  if(idx >= num) return;
1148  int sidx = idx << 5;
1149  float norm1 = 0, norm2 = 0;
1150 #pragma unroll
1151  for(int i = 0; i < 32; ++i)
1152  {
1153  temp[i] = tex1Dfetch<float4>(texDataF4, sidx +i);
1154  norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1155  temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1156  }
1157  norm1 = rsqrt(norm1);
1158 
1159 #pragma unroll
1160  for(int i = 0; i < 32; ++i)
1161  {
1162  temp[i].x = min(0.2f, temp[i].x * norm1);
1163  temp[i].y = min(0.2f, temp[i].y * norm1);
1164  temp[i].z = min(0.2f, temp[i].z * norm1);
1165  temp[i].w = min(0.2f, temp[i].w * norm1);
1166  norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1167  temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1168  }
1169 
1170  norm2 = rsqrt(norm2);
1171 #pragma unroll
1172  for(int i = 0; i < 32; ++i)
1173  {
1174  temp[i].x *= norm2; temp[i].y *= norm2;
1175  temp[i].z *= norm2; temp[i].w *= norm2;
1176  d_des[sidx + i] = temp[i];
1177  }
1178 }
1179 
1180 void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream)
1181 {
1182  int num = list->GetImgWidth();
1183  int width = got->GetImgWidth();
1184  int height = got->GetImgHeight();
1185 
1186  dtex->InitTexture(num * 128, 1, 1);
1187  CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc<float2>());
1188  CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1189  int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
1190  dim3 grid((num * 16 + block_width -1) / block_width);
1191  dim3 block(block_width);
1192 
1193  if(rect)
1194  {
1195  if(GlobalUtil::_UseDynamicIndexing)
1196  ComputeDescriptorRECT_Kernel<true><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1197  else
1198  ComputeDescriptorRECT_Kernel<false><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1199 
1200  }else
1201  {
1202  if(GlobalUtil::_UseDynamicIndexing)
1203  ComputeDescriptor_Kernel<true><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1204  else
1205  ComputeDescriptor_Kernel<false><<<grid, block>>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1206  }
1207  if(GlobalUtil::_NormalizedSIFT)
1208  {
1209  CuTexImage::CuTexObj dtexTex = dtex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1210  const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK;
1211  dim3 grid((num + block_width -1) / block_width);
1212  dim3 block(block_width);
1213  NormalizeDescriptor_Kernel<<<grid, block>>>(dtexTex.handle, (float4*) dtex->_cuData, num);
1214  }
1215  CheckErrorCUDA("ComputeDescriptor");
1216 }
1217 
1218 //////////////////////////////////////////////////////
1219 void ProgramCU::FinishCUDA()
1220 {
1221  cudaDeviceSynchronize();
1222 }
1223 
1224 int ProgramCU::CheckErrorCUDA(const char* location)
1225 {
1226  cudaError_t e = cudaGetLastError();
1227  if(e)
1228  {
1229  if(location) fprintf(stderr, "%s:\t", location);
1230  fprintf(stderr, "%s\n", cudaGetErrorString(e));
1231  //assert(0);
1232  return 1;
1233  }else
1234  {
1235  return 0;
1236  }
1237 }
1238 
1239 void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height)
1240 {
1241  int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1242  int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1243  if(col < width && row < height)
1244  {
1245  int index = row * width + col;
1246  float v = tex1Dfetch<float>(texData, index);
1247  d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1248  0.5 : __saturatef(0.5+20.0*v);
1249  }
1250 }
1251 ///
1252 void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
1253 {
1254  if(out->_cuData == NULL) return;
1255  int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
1256  CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1257  dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1258  dim3 block(BLOCK_DIM, BLOCK_DIM);
1259  ConvertDOG_Kernel<<<grid, block>>>(dogTex.handle, (float*) out->_cuData, width, height);
1260  ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
1261 }
1262 
1263 void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height)
1264 {
1265  int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1266  int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1267  if(col < width && row < height)
1268  {
1269  int index = row * width + col;
1270  float v = tex1Dfetch<float>(texData, index << 1);
1271  d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1272  0 : __saturatef(5 * v);
1273 
1274  }
1275 }
1276 
1277 
1278 void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
1279 {
1280  if(out->_cuData == NULL) return;
1281  int width = got->GetImgWidth(), height = got ->GetImgHeight();
1282  CuTexImage::CuTexObj gotTex = got->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1283  dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1284  dim3 block(BLOCK_DIM, BLOCK_DIM);
1285  ConvertGRD_Kernel<<<grid, block>>>(gotTex.handle, (float*) out->_cuData, width, height);
1286  ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
1287 }
1288 
1289 void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject_t texDataF4, float4* d_result, int width, int height)
1290 {
1291 
1292  int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1293  int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1294  if(col < width && row < height)
1295  {
1296  int index = row * width + col;
1297  float4 keyv = tex1Dfetch<float4>(texDataF4, index);
1298  int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);
1299  int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
1300  float v = inside? __saturatef(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
1301  d_result[index] = is_key && inside ?
1302  (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):
1303  make_float4(v, v, v, 1.0f) ;
1304  }
1305 }
1306 void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
1307 {
1308  if(out->_cuData == NULL) return;
1309  int width = key->GetImgWidth(), height = key ->GetImgHeight();
1310  CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1311  CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1312  dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM);
1313  dim3 block(BLOCK_DIM, BLOCK_DIM);
1314  ConvertKEY_Kernel<<<grid, block>>>(dogTex.handle, keyTex.handle, (float4*) out->_cuData, width, height);
1315 }
1316 
1317 
1318 void __global__ DisplayKeyPoint_Kernel(cudaTextureObject_t texDataF4, float4 * d_result, int num)
1319 {
1320  int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1321  if(idx >= num) return;
1322  float4 v = tex1Dfetch<float4>(texDataF4, idx);
1323  d_result[idx] = make_float4(v.x, v.y, 0, 1.0f);
1324 }
1325 
1326 void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
1327 {
1328  int num = ftex->GetImgWidth();
1329  int block_width = 64;
1330  dim3 grid((num + block_width -1) /block_width);
1331  dim3 block(block_width);
1332  CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1333  DisplayKeyPoint_Kernel<<<grid, block>>>(ftexTex.handle, (float4*) out->_cuData, num);
1334  ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
1335 }
1336 
1337 void __global__ DisplayKeyBox_Kernel(cudaTextureObject_t texDataF4, float4* d_result, int num)
1338 {
1339  int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1340  if(idx >= num) return;
1341  int kidx = idx / 10, vidx = idx - IMUL(kidx , 10);
1342  float4 v = tex1Dfetch<float4>(texDataF4, kidx);
1343  float sz = fabs(v.z * 3.0f);
1344  ///////////////////////
1345  float s, c; __sincosf(v.w, &s, &c);
1346  ///////////////////////
1347  float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz);
1348  float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz);
1349  float4 pos;
1350  pos.x = v.x + c * dx - s * dy;
1351  pos.y = v.y + c * dy + s * dx;
1352  pos.z = 0; pos.w = 1.0f;
1353  d_result[idx] = pos;
1354 }
1355 
1356 void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
1357 {
1358  int len = ftex->GetImgWidth();
1359  int block_width = 32;
1360  dim3 grid((len * 10 + block_width -1) / block_width);
1361  dim3 block(block_width);
1362  CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
1363  DisplayKeyBox_Kernel<<<grid, block>>>(ftexTex.handle, (float4*) out->_cuData, len * 10);
1364 }
1365 
1366 int ProgramCU::CheckCudaDevice(int device)
1367 {
1368  int count = 0, device_used;
1369  if(cudaGetDeviceCount(&count) != cudaSuccess || count <= 0)
1370  {
1371  ProgramCU::CheckErrorCUDA("CheckCudaDevice");
1372  return 0;
1373  }else if(count == 1)
1374  {
1375  cudaDeviceProp deviceProp;
1376  if ( cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess ||
1377  (deviceProp.major == 9999 && deviceProp.minor == 9999))
1378  {
1379  fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
1380  return 0;
1381  }else
1382  {
1383  GlobalUtil::_MemCapGPU = deviceProp.totalGlobalMem / 1024;
1384  GlobalUtil::_texMaxDimGL = 32768;
1385  if(GlobalUtil::_verbose)
1386  fprintf(stdout, "NOTE: changing maximum texture dimension to %d\n", GlobalUtil::_texMaxDimGL);
1387 
1388  }
1389  }
1390  if(device >0 && device < count)
1391  {
1392  cudaSetDevice(device);
1393  CheckErrorCUDA("cudaSetDevice\n");
1394  }
1395  cudaGetDevice(&device_used);
1396  if(device != device_used)
1397  fprintf(stderr, "\nERROR: Cannot set device to %d\n"
1398  "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
1399  return 1;
1400 }
1401 
1402 ////////////////////////////////////////////////////////////////////////////////////////
1403 // siftmatch funtions
1404 //////////////////////////////////////////////////////////////////////////////////////////
1405 
1406 #define MULT_TBLOCK_DIMX 128
1407 #define MULT_TBLOCK_DIMY 1
1408 #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
1409 #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)
1410 
1411 void __global__ MultiplyDescriptor_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, int* d_result, int num1, int num2, int3* d_temp)
1412 {
1413  int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
1414 
1415  int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
1416  __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1417  int read_idx1 = idx01 * 8 + threadIdx.x, read_idx2 = idx2 * 8;
1418  int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1419  int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1420 
1421  ///////////////////////////////////////////////////////////////
1422  //Load feature descriptors
1423  ///////////////////////////////////////////////////////////////
1424 #if MULT_BLOCK_DIMY == 16
1425  uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1426  data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
1427  data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
1428 #elif MULT_BLOCK_DIMY == 8
1429  if(threadIdx.x < 64)
1430  {
1431  uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1432  data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
1433  data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
1434  }
1435 #else
1436 #error
1437 #endif
1438  __syncthreads();
1439 
1440  ///
1441  if(idx2 >= num2) return;
1442  ///////////////////////////////////////////////////////////////////////////
1443  //compare descriptors
1444 
1445  int results[MULT_BLOCK_DIMY];
1446 #pragma unroll
1447  for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;
1448 
1449 #pragma unroll
1450  for(int i = 0; i < 8; ++i)
1451  {
1452  uint4 v = tex1Dfetch<uint4>(texDes2, read_idx2 + i);
1453  unsigned char* p2 = (unsigned char*)(&v);
1454 #pragma unroll
1455  for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1456  {
1457  unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
1458  results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
1459  + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
1460  + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
1461  + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
1462  + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
1463  + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
1464  + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
1465  + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
1466  }
1467  }
1468 
1469  int dst_idx = IMUL(idx1, num2) + idx2;
1470  if(d_temp)
1471  {
1472  int3 cmp_result = make_int3(0, -1, 0);
1473 
1474 #pragma unroll
1475  for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1476  {
1477  if(idx1 + i < num1)
1478  {
1479  cmp_result = results[i] > cmp_result.x?
1480  make_int3(results[i], idx1 + i, cmp_result.x) :
1481  make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1482  d_result[dst_idx + IMUL(i, num2)] = results[i];
1483  }
1484  }
1485  d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1486  }else
1487  {
1488 #pragma unroll
1489  for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1490  {
1491  if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
1492  }
1493  }
1494 
1495 }
1496 
1497 
1498 void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
1499 {
1500  int num1 = des1->GetImgWidth() / 8;
1501  int num2 = des2->GetImgWidth() / 8;
1502  dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1503  (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1504  dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1505  texDot->InitTexture( num2,num1);
1506  if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
1507  CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1508  CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1509 
1510  MultiplyDescriptor_Kernel<<<grid, block>>>(des1Tex.handle, des2Tex.handle, (int*)texDot->_cuData, num1, num2,
1511  (texCRT? (int3*)texCRT->_cuData : NULL));
1512 }
1513 
1514 struct Matrix33
1515 {
1516  float mat[3][3];
1517 };
1518 
1519 
1520 
1521 void __global__ MultiplyDescriptorG_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2,
1522  cudaTextureObject_t texLoc1, cudaTextureObject_t texLoc2,
1523  int* d_result, int num1, int num2, int3* d_temp,
1524  Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
1525 {
1526  int idx01 = (blockIdx.y * MULT_BLOCK_DIMY);
1527  int idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
1528 
1529  int idx1 = idx01 + threadIdx.y;
1530  int idx2 = idx02 + threadIdx.x;
1531  __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1532  __shared__ float loc1[MULT_BLOCK_DIMY * 2];
1533  int read_idx1 = idx01 * 8 + threadIdx.x ;
1534  int read_idx2 = idx2 * 8;
1535  int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1536  int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1537 #if MULT_BLOCK_DIMY == 16
1538  uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1539  data1[cache_idx1] = v.x;
1540  data1[cache_idx1+1] = v.y;
1541  data1[cache_idx1+2] = v.z;
1542  data1[cache_idx1+3] = v.w;
1543 #elif MULT_BLOCK_DIMY == 8
1544  if(threadIdx.x < 64)
1545  {
1546  uint4 v = tex1Dfetch<uint4>(texDes1, read_idx1);
1547  data1[cache_idx1] = v.x;
1548  data1[cache_idx1+1] = v.y;
1549  data1[cache_idx1+2] = v.z;
1550  data1[cache_idx1+3] = v.w;
1551  }
1552 #else
1553 #error
1554 #endif
1555  __syncthreads();
1556  if(threadIdx.x < MULT_BLOCK_DIMY * 2)
1557  {
1558  loc1[threadIdx.x] = tex1Dfetch<float>(texLoc1, 2 * idx01 + threadIdx.x);
1559  }
1560  __syncthreads();
1561  if(idx2 >= num2) return;
1562  int results[MULT_BLOCK_DIMY];
1563  /////////////////////////////////////////////////////////////////////////////////////////////
1564  //geometric verification
1565  /////////////////////////////////////////////////////////////////////////////////////////////
1566  int good_count = 0;
1567  float2 loc2 = tex1Dfetch<float2>(texLoc2, idx2);
1568 #pragma unroll
1569  for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1570  {
1571 
1572  if(idx1 + i < num1)
1573  {
1574  float* loci = loc1 + i * 2;
1575  float locx = loci[0], locy = loci[1];
1576  //homography
1577  float x[3], diff[2];
1578  x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
1579  x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
1580  x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
1581  diff[0] = FDIV(x[0], x[2]) - loc2.x;
1582  diff[1] = FDIV(x[1], x[2]) - loc2.y;
1583  float hdist = diff[0] * diff[0] + diff[1] * diff[1];
1584  if(hdist < hdistmax)
1585  {
1586  //check fundamental matrix
1587  float fx1[3], ftx2[3], x2fx1, se;
1588  fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
1589  fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
1590  fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
1591 
1592  ftx2[0] = F.mat[0][0] * loc2.x + F.mat[1][0] * loc2.y + F.mat[2][0];
1593  ftx2[1] = F.mat[0][1] * loc2.x + F.mat[1][1] * loc2.y + F.mat[2][1];
1594  //ftx2[2] = F.mat[0][2] * loc2.x + F.mat[1][2] * loc2.y + F.mat[2][2];
1595 
1596  x2fx1 = loc2.x * fx1[0] + loc2.y * fx1[1] + fx1[2];
1597  se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
1598  results[i] = se < fdistmax? 0: -262144;
1599  }else
1600  {
1601  results[i] = -262144;
1602  }
1603  }else
1604  {
1605  results[i] = -262144;
1606  }
1607  good_count += (results[i] >=0);
1608  }
1609  /////////////////////////////////////////////////////////////////////////////////////////////
1610  ///compare feature descriptors anyway
1611  /////////////////////////////////////////////////////////////////////////////////////////////
1612  if(good_count > 0)
1613  {
1614 #pragma unroll
1615  for(int i = 0; i < 8; ++i)
1616  {
1617  uint4 v = tex1Dfetch<uint4>(texDes2, read_idx2 + i);
1618  unsigned char* p2 = (unsigned char*)(&v);
1619 #pragma unroll
1620  for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1621  {
1622  unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
1623  results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
1624  + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
1625  + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
1626  + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
1627  + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
1628  + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
1629  + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
1630  + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
1631  }
1632  }
1633  }
1634  int dst_idx = IMUL(idx1, num2) + idx2;
1635  if(d_temp)
1636  {
1637  int3 cmp_result = make_int3(0, -1, 0);
1638 #pragma unroll
1639  for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
1640  {
1641  if(idx1 + i < num1)
1642  {
1643  cmp_result = results[i] > cmp_result.x?
1644  make_int3(results[i], idx1 + i, cmp_result.x) :
1645  make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1646  d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1647  }else
1648  {
1649  break;
1650  }
1651  }
1652  d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1653  }else
1654  {
1655 #pragma unroll
1656  for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1657  {
1658  if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1659  else break;
1660  }
1661  }
1662 
1663 }
1664 
1665 
1666 void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
1667  CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
1668  float* H, float hdistmax, float* F, float fdistmax)
1669 {
1670  int num1 = des1->GetImgWidth() / 8;
1671  int num2 = des2->GetImgWidth() / 8;
1672  Matrix33 MatF, MatH;
1673  //copy the matrix
1674  memcpy(MatF.mat, F, 9 * sizeof(float));
1675  memcpy(MatH.mat, H, 9 * sizeof(float));
1676  //thread blocks
1677  dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1678  (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1679  dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1680  //intermediate results
1681  texDot->InitTexture( num2,num1);
1682  if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
1683  CuTexImage::CuTexObj loc1Tex = loc1->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
1684  CuTexImage::CuTexObj loc2Tex = loc2->BindTexture(texDataDesc, cudaCreateChannelDesc<float2>());
1685  CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1686  CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc<uint4>());
1687  MultiplyDescriptorG_Kernel<<<grid, block>>>(des1Tex.handle, des2Tex.handle, loc1Tex.handle, loc2Tex.handle,
1688  (int*)texDot->_cuData, num1, num2,
1689  (texCRT? (int3*)texCRT->_cuData : NULL),
1690  MatH, hdistmax, MatF, fdistmax);
1691 }
1692 
1693 #define ROWMATCH_BLOCK_WIDTH 32
1694 #define ROWMATCH_BLOCK_HEIGHT 1
1695 
1696 void __global__ RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
1697 {
1698 #if ROWMATCH_BLOCK_HEIGHT == 1
1699  __shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
1700  __shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
1701  __shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
1702  int row = blockIdx.y;
1703 #else
1704  __shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1705  __shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1706  __shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1707  int* dotmax = x_dotmax[threadIdx.y];
1708  int* dotnxt = x_dotnxt[threadIdx.y];
1709  int* dotidx = x_dotidx[threadIdx.y];
1710  int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
1711 #endif
1712 
1713  int base_address = IMUL(row , num2);
1714  int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
1715  for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
1716  {
1717  if(threadIdx.x + i < num2)
1718  {
1719  int v = d_dot[base_address + threadIdx.x + i]; // tex1Dfetch(texDOT, base_address + threadIdx.x + i);
1720  bool test = v > t_dotmax;
1721  t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
1722  t_dotidx = test? (threadIdx.x + i) : t_dotidx;
1723  t_dotmax = test? v: t_dotmax;
1724  }
1725  __syncthreads();
1726  }
1727  dotmax[threadIdx.x] = t_dotmax;
1728  dotnxt[threadIdx.x] = t_dotnxt;
1729  dotidx[threadIdx.x] = t_dotidx;
1730  __syncthreads();
1731 
1732 #pragma unroll
1733  for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
1734  {
1735  if(threadIdx.x < step)
1736  {
1737  int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
1738  bool test = v2 > v1;
1739  dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
1740  dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
1741  dotmax[threadIdx.x] = test? v2 : v1;
1742  }
1743  __syncthreads();
1744  }
1745  if(threadIdx.x == 0)
1746  {
1747  float dist = acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
1748  float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
1749  //float ratio = dist / distn;
1750  d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//? : -1;
1751  }
1752 
1753 }
1754 
1755 
1756 void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
1757 {
1758  int num1 = texDot->GetImgHeight();
1759  int num2 = texDot->GetImgWidth();
1760  dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
1761  dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
1762  RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
1763  (int*)texMatch->_cuData, num2, distmax, ratiomax);
1764 }
1765 
1766 #define COLMATCH_BLOCK_WIDTH 32
1767 
1768 void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
1769 {
1770  int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
1771  if(col >= num2) return;
1772  int3 result = d_crt[col];//tex1Dfetch(texCT, col);
1773  int read_idx = col + num2;
1774  for(int i = 1; i < height; ++i, read_idx += num2)
1775  {
1776  int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
1777  result = result.x < temp.x?
1778  make_int3(temp.x, temp.y, max(result.x, temp.z)) :
1779  make_int3(result.x, result.y, max(result.z, temp.x));
1780  }
1781 
1782  float dist = acos(min(result.x * 0.000003814697265625f, 1.0));
1783  float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
1784  //float ratio = dist / distn;
1785  d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//? : -1;
1786 
1787 }
1788 
1789 void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
1790 {
1791  int height = texCRT->GetImgHeight();
1792  int num2 = texCRT->GetImgWidth();
1793  //texCRT->BindTexture(texCT);
1794  dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
1795  dim3 block(COLMATCH_BLOCK_WIDTH);
1796  ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
1797 }
1798 
1799 #endif