ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
ProgramCL.cpp
Go to the documentation of this file.
1 // File: ProgramCL.cpp
3 // Author: Changchang Wu
4 // Description : implementation of CL related class.
5 // class ProgramCL A simple wrapper of Cg programs
6 //
7 // Copyright (c) 2007 University of North Carolina at Chapel Hill
8 // All Rights Reserved
9 //
10 // Permission to use, copy, modify and distribute this software and its
11 // documentation for educational, research and non-profit purposes, without
12 // fee, and without a written agreement is hereby granted, provided that the
13 // above copyright notice and the following paragraph appear in all copies.
14 //
15 // The University of North Carolina at Chapel Hill make no representations
16 // about the suitability of this software for any purpose. It is provided
17 // 'as is' without express or implied warranty.
18 //
19 // Please send BUG REPORTS to ccwu@cs.unc.edu
20 //
22 
23 #if defined(CL_SIFTGPU_ENABLED)
24 
25 #include <CL/opencl.h>
26 #include <GL/glew.h>
27 
28 #include <iostream>
29 #include <iomanip>
30 #include <vector>
31 #include <strstream>
32 #include <algorithm>
33 #include <stdlib.h>
34 #include <math.h>
35 #include <string.h>
36 using namespace std;
37 
38 #include "GlobalUtil.h"
39 #include "CLTexImage.h"
40 #include "ProgramCL.h"
41 #include "SiftGPU.h"
42 
43 
44 #if defined(_WIN32)
45  #pragma comment (lib, "OpenCL.lib")
46 #endif
47 
48 #ifndef _INC_WINDOWS
49 #ifndef WIN32_LEAN_AND_MEAN
50  #define WIN32_LEAN_AND_MEAN
51 #endif
52 #include <windows.h>
53 #endif
54 
56 // Construction/Destruction
58 
59 ProgramCL::ProgramCL()
60 {
61  _program = NULL;
62  _kernel = NULL;
63  _valid = 0;
64 }
65 
66 ProgramCL::~ProgramCL()
67 {
68  if(_kernel) clReleaseKernel(_kernel);
69  if(_program) clReleaseProgram(_program);
70 }
71 
72 ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
73 {
74  const char * src[1] = {code}; cl_int status;
75 
76  _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
77  if(status != CL_SUCCESS) _valid = 0;
78 
79  status = clBuildProgram(_program, 0, NULL,
81  "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" :
82  "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
83 
84  if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
85  else if(GlobalUtil::_debug) PrintBuildLog(device, 0);
86 
87  _kernel = clCreateKernel(_program, name, &status);
88  if(status != CL_SUCCESS) _valid = 0;
89 }
90 
91 void ProgramCL::PrintBuildLog(cl_device_id device, int all)
92 {
93  char buffer[10240] = "\0";
94  cl_int status = clGetProgramBuildInfo(
95  _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
96  if(all )
97  {
98  std::cerr << buffer << endl;
99  }else
100  {
101  const char * pos = strstr(buffer, "ptxas");
102  if(pos) std::cerr << pos << endl;
103  }
104 }
105 
108 
109 ProgramBagCL::ProgramBagCL()
110 {
112  _context = NULL; _queue = NULL;
113  s_gray = s_sampling = NULL;
114  s_packup = s_zero_pass = NULL;
115  s_gray_pack = s_unpack = NULL;
116  s_sampling_u = NULL;
117  s_dog_pass = NULL;
118  s_grad_pass = NULL;
119  s_grad_pass2 = NULL;
120  s_unpack_dog = NULL;
121  s_unpack_grd = NULL;
122  s_unpack_key = NULL;
123  s_keypoint = NULL;
124  f_gaussian_skip0 = NULL;
125  f_gaussian_skip1 = NULL;
126  f_gaussian_step = 0;
127 
129  GlobalUtil::StartTimer("Initialize OpenCL");
130  if(!InitializeContext()) return;
132 
133 }
134 
135 
136 
137 ProgramBagCL::~ProgramBagCL()
138 {
139  if(s_gray) delete s_gray;
140  if(s_sampling) delete s_sampling;
141  if(s_zero_pass) delete s_zero_pass;
142  if(s_packup) delete s_packup;
143  if(s_unpack) delete s_unpack;
144  if(s_gray_pack) delete s_gray_pack;
145  if(s_sampling_u) delete s_sampling_u;
146  if(s_dog_pass) delete s_dog_pass;
147  if(s_grad_pass) delete s_grad_pass;
148  if(s_grad_pass2) delete s_grad_pass2;
149  if(s_unpack_dog) delete s_unpack_dog;
150  if(s_unpack_grd) delete s_unpack_grd;
151  if(s_unpack_key) delete s_unpack_key;
152  if(s_keypoint) delete s_keypoint;
153 
154  if(f_gaussian_skip1) delete f_gaussian_skip1;
155 
156  for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
157  {
158  if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
159  }
160  if(f_gaussian_step && _gaussian_step_num > 0)
161  {
162  for(int i = 0; i< _gaussian_step_num; i++)
163  {
164  delete f_gaussian_step[i];
165  }
166  delete[] f_gaussian_step;
167  }
168 
170  if(_context) clReleaseContext(_context);
171  if(_queue) clReleaseCommandQueue(_queue);
172 }
173 
174 bool ProgramBagCL::InitializeContext()
175 {
176  cl_uint num_platform, num_device;
177  cl_int status;
178  // Get OpenCL platform count
179  status = clGetPlatformIDs (0, NULL, &num_platform);
180  if (status != CL_SUCCESS || num_platform == 0) return false;
181 
182  cl_platform_id platforms[16];
183  if(num_platform > 16 ) num_platform = 16;
184  status = clGetPlatformIDs (num_platform, platforms, NULL);
185  _platform = platforms[0];
186 
188  status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
189  if(status != CL_SUCCESS || num_device == 0) return false;
190 
191  // Create the device list
192  cl_device_id* devices = new cl_device_id [num_device];
193  status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
194  _device = (status == CL_SUCCESS? devices[0] : 0); delete[] devices;
195  if(status != CL_SUCCESS) return false;
196 
197 
199  {
200  cl_device_mem_cache_type is_gcache;
201  clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
202  if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
203  //else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
204  //else std::cout << "Read/Write cache for global memory\n";
205  }
206 
207  //context;
209  {
210  cl_context_properties prop[] = {
211  CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
212  CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
213  CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0 };
214  _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);
215  if(status != CL_SUCCESS) return false;
216  }else
217  {
218  _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);
219  if(status != CL_SUCCESS) return false;
220  }
221 
222  //command queue
223  _queue = clCreateCommandQueue(_context, _device, 0, &status);
224  return status == CL_SUCCESS;
225 }
226 
227 void ProgramBagCL::InitProgramBag(SiftParam&param)
228 {
229  GlobalUtil::StartTimer("Load Programs");
230  LoadFixedShaders();
231  LoadDynamicShaders(param);
232  if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
234 }
235 
236 
237 void ProgramBagCL::UnloadProgram()
238 {
239 
240 }
241 
242 void ProgramBagCL::FinishCL()
243 {
244  clFinish(_queue);
245 }
246 
247 void ProgramBagCL::LoadFixedShaders()
248 {
249 
250 
251  s_gray = new ProgramCL( "gray",
252  "__kernel void gray(__read_only image2d_t input, __write_only image2d_t output) {\n"
253  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
254  "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
255  "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
256  "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
257  "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
258  "write_imagef(output, coord, result); }", _context, _device );
259 
260 
261  s_sampling = new ProgramCL("sampling",
262  "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output,\n"
263  " int width, int height) {\n"
264  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
265  "int x = get_global_id(0), y = get_global_id(1); \n"
266  "if( x >= width || y >= height) return;\n"
267  "int xa = x + x, ya = y + y; \n"
268  "int xb = xa + 1, yb = ya + 1; \n"
269  "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
270  "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
271  "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
272  "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
273  "float4 result = (float4) (v1, v2, v3, v4);"
274  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
275 
276  s_sampling_k = new ProgramCL("sampling_k",
277  "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
278  " int width, int height,\n"
279  " int step, int halfstep) {\n"
280  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
281  "int x = get_global_id(0), y = get_global_id(1); \n"
282  "if( x >= width || y >= height) return;\n"
283  "int xa = x * step, ya = y *step; \n"
284  "int xb = xa + halfstep, yb = ya + halfstep; \n"
285  "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
286  "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
287  "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
288  "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
289  "float4 result = (float4) (v1, v2, v3, v4);"
290  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
291 
292 
293  s_sampling_u = new ProgramCL("sampling_u",
294  "__kernel void sampling_u(__read_only image2d_t input, \n"
295  " __write_only image2d_t output,\n"
296  " int width, int height,\n"
297  " float step, float halfstep) {\n"
298  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
299  "int x = get_global_id(0), y = get_global_id(1); \n"
300  "if( x >= width || y >= height) return;\n"
301  "float xa = x * step, ya = y *step; \n"
302  "float xb = xa + halfstep, yb = ya + halfstep; \n"
303  "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
304  "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
305  "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
306  "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
307  "float4 result = (float4) (v1, v2, v3, v4);"
308  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
309 
310 
311  s_zero_pass = new ProgramCL("zero_pass",
312  "__kernel void zero_pass(__write_only image2d_t output){\n"
313  "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
314  "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
315 
316  s_packup = new ProgramCL("packup",
317  "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
318  " int twidth, int theight, int width){\n"
319  "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
320  "if(coord.x >= twidth || coord.y >= theight) return;\n"
321  "int index0 = (coord.y + coord.y) * width; \n"
322  "int index1 = index0 + coord.x;\n"
323  "int x2 = min(width -1, coord.x); \n"
324  "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
325  "int index2 = index1 + width; \n"
326  "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
327  "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
328 
329  s_dog_pass = new ProgramCL("dog_pass",
330  "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
331  " __write_only image2d_t dog, int width, int height) {\n"
332  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
333  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
334  "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
335  "if( coord.x >= width || coord.y >= height) return;\n"
336  "float4 cc = read_imagef(tex , sampler, coord); \n"
337  "float4 cp = read_imagef(texp, sampler, coord);\n"
338  "write_imagef(dog, coord, cc - cp); }\n", _context, _device);
339 
340  s_grad_pass = new ProgramCL("grad_pass",
341  "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
342  " __write_only image2d_t dog, int width, int height,\n"
343  " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
344  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
345  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
346  "int x = get_global_id(0), y = get_global_id(1); \n"
347  "if( x >= width || y >= height) return;\n"
348  "int2 coord = (int2) (x, y);\n"
349  "float4 cc = read_imagef(tex , sampler, coord); \n"
350  "float4 cp = read_imagef(texp, sampler, coord);\n"
351  "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
352  "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
353  "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
354  "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
355  "write_imagef(dog, coord, cc - cp); \n"
356  "float4 dx = (float4)(cc.y - cl.x, cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
357  "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
358  "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
359  "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
360 
361  s_grad_pass2 = new ProgramCL("grad_pass2",
362  "#define BLOCK_DIMX 32\n"
363  "#define BLOCK_DIMY 14\n"
364  "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
365  "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
366  " __write_only image2d_t dog, int width, int height,\n"
367  " __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n"
368  "__local float block[BLOCK_SIZE * 4]; \n"
369  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
370  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
371  "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
372  " get_global_id(1) - get_group_id(1) * 2- 1); \n"
373  "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
374  "float4 cc = read_imagef(tex, sampler, coord);\n"
375  "block[idx ] = cc.x;\n"
376  "block[idx + BLOCK_SIZE ] = cc.y;\n"
377  "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
378  "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
379  "barrier(CLK_LOCAL_MEM_FENCE);\n"
380  "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
381  "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
382  "if( coord.x >= width) return; \n"
383  "if( coord.y >= height) return;\n"
384  "float4 cp = read_imagef(texp, sampler, coord);\n"
385  "float4 dx = (float4)( cc.y - block[idx - 1 + BLOCK_SIZE], \n"
386  " block[idx + 1] - cc.x, \n"
387  " cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
388  " block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
389  "float4 dy = (float4)( cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
390  " cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
391  //" cc.zw - block[idx - BLOCK_DIMX].zw, \n"
392  " block[idx + BLOCK_DIMX] - cc.x,\n "
393  " block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
394  //" block[idx + BLOCK_DIMX].xy - cc.xy);\n"
395  "write_imagef(dog, coord, cc - cp); \n"
396  "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
397  "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
398 }
399 
400 void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
401 {
402  LoadKeypointShader();
403  LoadGenListShader(param._dog_level_num, 0);
404  CreateGaussianFilters(param);
405 }
406 
407 
408 void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam&param)
409 {
410  float sigma = param.GetInitialSmoothSigma(octave_min);
411  if(sigma == 0)
412  {
413  f_gaussian_skip0 = NULL;
414  }else
415  {
416  for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
417  {
418  if(f_gaussian_skip0_v[i]->_id == octave_min)
419  {
420  f_gaussian_skip0 = f_gaussian_skip0_v[i];
421  return ;
422  }
423  }
424  FilterCL * filter = CreateGaussianFilter(sigma);
425  filter->_id = octave_min;
426  f_gaussian_skip0_v.push_back(filter);
427  f_gaussian_skip0 = filter;
428  }
429 
430 }
431 
432 void ProgramBagCL::CreateGaussianFilters(SiftParam&param)
433 {
434  if(param._sigma_skip0>0.0f)
435  {
436  f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
437  f_gaussian_skip0->_id = GlobalUtil::_octave_min_default;
438  f_gaussian_skip0_v.push_back(f_gaussian_skip0);
439  }
440  if(param._sigma_skip1>0.0f)
441  {
442  f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
443  }
444 
445  f_gaussian_step = new FilterCL*[param._sigma_num];
446  for(int i = 0; i< param._sigma_num; i++)
447  {
448  f_gaussian_step[i] = CreateGaussianFilter(param._sigma[i]);
449  }
450  _gaussian_step_num = param._sigma_num;
451 }
452 
453 
454 FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
455 {
456  //pixel inside 3*sigma box
457  int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
458  int width = 2*sz + 1;
459 
460  //filter size truncation
462  {
463  std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
465  width = 2 * sz + 1;
466  }
467 
468  int i;
469  float * kernel = new float[width];
470  float rv = 1.0f/(sigma*sigma);
471  float v, ksum =0;
472 
473  // pre-compute filter
474  for( i = -sz ; i <= sz ; ++i)
475  {
476  kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
477  ksum += v;
478  }
479 
480  //normalize the kernel
481  rv = 1.0f / ksum;
482  for(i = 0; i< width ;i++) kernel[i]*=rv;
483 
484  FilterCL * filter = CreateFilter(kernel, width);
485  delete [] kernel;
486  if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
487  return filter;
488 }
489 
490 FilterCL* ProgramBagCL::CreateFilter(float kernel[], int width)
491 {
492  FilterCL * filter = new FilterCL;
493  filter->s_shader_h = CreateFilterH(kernel, width);
494  filter->s_shader_v = CreateFilterV(kernel, width);
495  filter->_size = width;
496  filter->_id = 0;
497  return filter;
498 }
499 
500 ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
501 {
502  int halfwidth = width >>1;
503  float * pf = kernel + halfwidth;
504  int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
505  int npixel = (nhpixel<<1)+1;//
506  float weight[3];
507 
509  char buffer[10240];
510  ostrstream out(buffer, 10240);
511  out<<setprecision(8);
512 
513 
514  //CL_DEVICE_IMAGE2D_MAX_WIDTH
515  out<<
516  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
517  "__kernel void filter_h(__read_only image2d_t input, \n"
518  " __write_only image2d_t output, int width_, int height_) {\n"
519  "int x = get_global_id(0);\n"
520  "int y = get_global_id(1); \n"
521  "if( x > width_ || y > height_) return; \n"
522  "float4 pc; int2 coord; \n"
523  "float4 result = (float4)(0.0);\n";
524  for(int i = 0 ; i < npixel ; i++)
525  {
526  out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
527  out<<"pc= read_imagef(input, sampler, coord);\n";
529  out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
530  //for each sub-pixel j in center, the weight of sub-pixel k
531  int xw = (i - nhpixel)*2;
532  for(int j = 0; j < 3; j++)
533  {
534  int xwn = xw + j -1;
535  weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
536  }
537  if(weight[1] == 0.0)
538  {
539  out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
540  }
541  else
542  {
543  out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
544  out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
545  }
546  }
547  out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
548  return new ProgramCL("filter_h", buffer, _context, _device);
549 }
550 
551 
552 
553 ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
554 {
555 
556  int halfwidth = width >>1;
557  float * pf = kernel + halfwidth;
558  int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
559  int npixel = (nhpixel<<1)+1;//
560  float weight[3];
561 
563  char buffer[10240];
564  ostrstream out(buffer, 10240);
565  out<<setprecision(8);
566 
567 
568  //CL_DEVICE_IMAGE2D_MAX_WIDTH
569  out<<
570  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
571  "__kernel void filter_v(__read_only image2d_t input, \n"
572  " __write_only image2d_t output, int width_, int height_) {\n"
573  "int x = get_global_id(0);\n"
574  "int y = get_global_id(1); \n"
575  "if( x > width_ || y >= height_) return; \n"
576  "float4 pc; int2 coord; \n"
577  "float4 result = (float4)(0.0);\n";
578  for(int i = 0 ; i < npixel ; i++)
579  {
580  out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
581  out<<"pc= read_imagef(input, sampler, coord);\n";
583  out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
584  //for each sub-pixel j in center, the weight of sub-pixel k
585  int xw = (i - nhpixel)*2;
586  for(int j = 0; j < 3; j++)
587  {
588  int xwn = xw + j -1;
589  weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
590  }
591  if(weight[1] == 0.0)
592  {
593  out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
594  }
595  else
596  {
597  out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
598  out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
599  }
600  }
601  out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
602  return new ProgramCL("filter_v", buffer, _context, _device);
603 
604 }
605 
606 void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
607 {
608  cl_kernel kernelh = filter->s_shader_h->_kernel;
609  cl_kernel kernelv = filter->s_shader_v->_kernel;
611 
612  cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
613  cl_int w_ = w - 1, h_ = h - 1;
614 
615  size_t dim0 = 16, dim1 = 16;
616  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
617 
618  clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
619  clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
620  clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
621  clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
622  status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
623  CheckErrorCL(status, "ProgramBagCL::FilterImageH");
624  if(status != CL_SUCCESS) return;
625 
626  clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
627  clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
628  clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
629  clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
630  size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
631  status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
632  CheckErrorCL(status, "ProgramBagCL::FilterImageV");
633  //clReleaseEvent(event);
634 }
635 
636 void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
637 {
638  cl_kernel kernel= s_sampling_u->_kernel;
639  float scale = 1.0f / (1 << log_scale);
640  float offset = scale * 0.5f;
641  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
642  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
643  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
644  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
645  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
646  clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
647  clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
648 
649  size_t dim0 = 16, dim1 = 16;
650  //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
651  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
652  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
653  CheckErrorCL(status, "ProgramBagCL::SampleImageU");
654 }
655 
656 void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
657 {
658  cl_kernel kernel;
659  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
660  if(log_scale == 1)
661  {
662  kernel = s_sampling->_kernel;
663  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
664  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
665  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
666  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
667  }else
668  {
669  cl_int fullstep = (1 << log_scale);
670  cl_int halfstep = fullstep >> 1;
671  kernel = s_sampling_k->_kernel;
672  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
673  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
674  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
675  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
676  clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
677  clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
678  }
679  size_t dim0 = 128, dim1 = 1;
680  //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
681  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
682  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
683  CheckErrorCL(status, "ProgramBagCL::SampleImageD");
684 }
685 
686 void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
687 {
688  if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
689 }
690 
691 void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
692 {
693  if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
694 }
695 
696 void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
697 {
698  int margin = 0, use_gm2 = 1;
699  bool both_grad_dog = rot->_clData && grad->_clData;
700  cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
701  cl_kernel kernel ; size_t dim0, dim1;
702  if(!both_grad_dog) {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
703  else if(use_gm2) {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
704  else {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
705  size_t gsz[2] = { (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
706  (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
707  size_t lsz[2] = {dim0, dim1};
708  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
709  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
710  clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
711  clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
712  clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
713  if(both_grad_dog)
714  {
715  clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
716  clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
717  }
719  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
720  CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
721 }
722 
723 
724 void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
725 {
726  cl_kernel kernel = s_keypoint->_kernel;
727  cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
728  float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
729  float threshold1 = Tdog;
730  float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
731 
732  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
733  clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
734  clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
735  clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
736  clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
737  clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
738  clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
739  clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
740  clSetKernelArg(kernel, 8, sizeof(cl_int), &(h));
741 
742  size_t dim0 = 8, dim1 = 8;
743  //if( w * h / dim0 / dim1 < 16) dim1 /= 2;
744  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
745  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
746  CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
747 }
748 
749 void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
750 {
751  cl_kernel kernel = s_unpack->_kernel;
752  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
753  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
754  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
755  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
756  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
757  const size_t dim0 = 16, dim1 = 16;
758  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
759  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
760 
761  CheckErrorCL(status, "ProgramBagCL::UnpackImage");
762  FinishCL();
763 
764 }
765 
766 void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
767 {
768  if(s_unpack_dog == NULL) return;
769  cl_kernel kernel = s_unpack_dog->_kernel;
770  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
771  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
772  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
773  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
774  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
775  const size_t dim0 = 16, dim1 = 16;
776  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
777  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
778 
779  CheckErrorCL(status, "ProgramBagCL::UnpackImage");
780  FinishCL();
781 }
782 
783 void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
784 {
785  if(s_unpack_grd == NULL) return;
786  cl_kernel kernel = s_unpack_grd->_kernel;
787  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
788  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
789  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
790  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
791  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
792  const size_t dim0 = 16, dim1 = 16;
793  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
794  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
795 
796  CheckErrorCL(status, "ProgramBagCL::UnpackImage");
797  FinishCL();
798 }
799 void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
800 {
801  if(s_unpack_key == NULL) return;
802  cl_kernel kernel = s_unpack_key->_kernel;
803  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
804  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
805  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
806  clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
807  clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
808  clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
809  const size_t dim0 = 16, dim1 = 16;
810  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
811  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
812 
813  CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
814  FinishCL();
815 }
816 void ProgramBagCL::LoadDescriptorShader()
817 {
819  LoadDescriptorShaderF2();
820 }
821 
822 void ProgramBagCL::LoadDescriptorShaderF2()
823 {
824 
825 }
826 
827 void ProgramBagCL::LoadOrientationShader(void)
828 {
829 
830 }
831 
832 void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
833 {
834 
835 }
836 
837 void ProgramBagCL::LoadKeypointShader()
838 {
839  int i; char buffer[20240];
840  ostrstream out(buffer, 20240);
841  streampos pos;
842 
843  //tex(X)(Y)
844  //X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)
845  //Y: (CDU) (CENTER 0, DOWN -1, UP +1)
846  out<<
847  "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
848  " __read_only image2d_t texD, __write_only image2d_t texK,\n"
849  " float THRESHOLD0, float THRESHOLD1, \n"
850  " float THRESHOLD2, int width, int height)\n"
851  "{\n"
852  " sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
853  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
854  " int x = get_global_id(0), y = get_global_id(1);\n"
855  " if(x >= width || y >= height) return; \n"
856  " int xp = x - 1, xn = x + 1;\n"
857  " int yp = y - 1, yn = y + 1;\n"
858  " int2 coord0 = (int2) (x, y); \n"
859  " int2 coord1 = (int2) (xp, y); \n"
860  " int2 coord2 = (int2) (xn, y); \n"
861  " int2 coord3 = (int2) (x, yp); \n"
862  " int2 coord4 = (int2) (x, yn); \n"
863  " int2 coord5 = (int2) (xp, yp); \n"
864  " int2 coord6 = (int2) (xp, yn); \n"
865  " int2 coord7 = (int2) (xn, yp); \n"
866  " int2 coord8 = (int2) (xn, yn); \n"
867  " float4 ccc = read_imagef(tex, sampler,coord0);\n"
868  " float4 clc = read_imagef(tex, sampler,coord1);\n"
869  " float4 crc = read_imagef(tex, sampler,coord2);\n"
870  " float4 ccd = read_imagef(tex, sampler,coord3);\n"
871  " float4 ccu = read_imagef(tex, sampler,coord4);\n"
872  " float4 cld = read_imagef(tex, sampler,coord5);\n"
873  " float4 clu = read_imagef(tex, sampler,coord6);\n"
874  " float4 crd = read_imagef(tex, sampler,coord7);\n"
875  " float4 cru = read_imagef(tex, sampler,coord8);\n"
876  " float4 cc = ccc;\n"
877  " float4 v1[4], v2[4];\n"
878  " v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
879  " v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
880  " v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
881  " v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
882  " v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
883  " v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
884  " v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
885  " v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
886  " float4 key4 = (float4)(0); \n";
887  //test against 8 neighbours
888  //use variable to identify type of extremum
889  //1.0 for local maximum and -1.0 for minimum
890  for(i = 0; i < 4; ++i)
891  out<<
892  " if(cc.s"<<i<<" > THRESHOLD0){ \n"
893  " if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
894  " }else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
895  " if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
896  " }";
897 
898  out<<
899  " if(x ==0) {key4.x = key4.z= 0; }\n"
900  " else if(x + 1 == width) {key4.y = key4.w = 0;}\n"
901  " if(y ==0) {key4.x = key4.y = 0; }\n"
902  " else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
903  " float4 ak = fabs(key4); \n"
904  " float keysum = ak.x + ak.y + ak.z + ak.w; \n"
905  " float4 result = (float4)(0.0);\n"
906  " if(keysum == 1.0) {\n"
907  " float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
908 
909  //do edge supression first..
910  //vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
911  //vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
912  for(i = 0; i < 4; ++i)
913  out <<
914  " if(key4.s"<<i<<" != 0)\n"
915  " {\n"
916  " float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
917  " float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
918  " float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
919  " fx["<<i<<"] = D5.x; fy["<<i<<"] = D5.y ;\n"
920  " fxx["<<i<<"] = D2.x + D2.y;\n"
921  " fyy["<<i<<"] = D2.z + D2.w;\n"
922  " fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
923  " float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
924  " float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
925  " float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
926  " if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
927  " }\n";
928 
929  out <<
930  " if(keysum == 1) {\n";
932  //read 9 pixels of upper/lower level
933  out<<
934  " float4 v4[4], v5[4], v6[4];\n"
935  " ccc = read_imagef(texU, sampler,coord0);\n"
936  " clc = read_imagef(texU, sampler,coord1);\n"
937  " crc = read_imagef(texU, sampler,coord2);\n"
938  " ccd = read_imagef(texU, sampler,coord3);\n"
939  " ccu = read_imagef(texU, sampler,coord4);\n"
940  " cld = read_imagef(texU, sampler,coord5);\n"
941  " clu = read_imagef(texU, sampler,coord6);\n"
942  " crd = read_imagef(texU, sampler,coord7);\n"
943  " cru = read_imagef(texU, sampler,coord8);\n"
944  " float4 cu = ccc;\n"
945  " v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
946  " v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
947  " v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
948  " v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
949  " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
950  " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
951  " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
952  " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
953 
954  for(i = 0; i < 4; ++i)
955  out <<
956  " if(key4.s"<<i<<" == 1.0)\n"
957  " {\n"
958  " if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
959  " any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
960  " }else if(key4.s"<<i<<" == -1.0)\n"
961  " {\n"
962  " if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
963  " any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
964  " }\n";
965 
966  out <<
967  " if(keysum == 1.0) { \n";
968  out <<
969  " ccc = read_imagef(texD, sampler,coord0);\n"
970  " clc = read_imagef(texD, sampler,coord1);\n"
971  " crc = read_imagef(texD, sampler,coord2);\n"
972  " ccd = read_imagef(texD, sampler,coord3);\n"
973  " ccu = read_imagef(texD, sampler,coord4);\n"
974  " cld = read_imagef(texD, sampler,coord5);\n"
975  " clu = read_imagef(texD, sampler,coord6);\n"
976  " crd = read_imagef(texD, sampler,coord7);\n"
977  " cru = read_imagef(texD, sampler,coord8);\n"
978  " float4 cd = ccc;\n"
979  " v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
980  " v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
981  " v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
982  " v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
983  " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
984  " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
985  " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
986  " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
987  for(i = 0; i < 4; ++i)
988  out <<
989  " if(key4.s"<<i<<" == 1.0)\n"
990  " {\n"
991  " if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
992  " any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
993  " }else if(key4.s"<<i<<" == -1.0)\n"
994  " {\n"
995  " if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
996  " any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
997  " }\n";
998 
999  out <<
1000  " if(keysum==1.0) {\n";
1003  {
1004  out <<
1005  " float4 offset = (float4)(0); \n";
1006  for(i = 1; i < 4; ++i)
1007  out <<
1008  " if(key4.s"<<i<<" != 0) \n"
1009  " {\n"
1010  " cu.s0 = cu.s"<<i<<"; cd.s0 = cd.s"<<i<<"; cc.s0 = cc.s"<<i<<"; \n"
1011  " v4[0] = v4["<<i<<"]; v5[0] = v5["<<i<<"]; \n"
1012  " fxy[0] = fxy["<<i<<"]; fxx[0] = fxx["<<i<<"]; fyy[0] = fyy["<<i<<"]; \n"
1013  " fx[0] = fx["<<i<<"]; fy[0] = fy["<<i<<"]; \n"
1014  " }\n";
1015 
1016  out <<
1017  " float fs = 0.5*( cu.s0 - cd.s0 ); \n"
1018  " float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
1019  " float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
1020  " float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
1021  " float4 A0, A1, A2 ; \n"
1022  " A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]); \n"
1023  " A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]); \n"
1024  " A2 = (float4)(fxs, fys, fss, -fs); \n"
1025  " float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0)); \n"
1026  " float maxa = max(max(x3.x, x3.y), x3.z); \n"
1027  " if(maxa >= 1e-10 ) \n"
1028  " { \n"
1029  " if(x3.y ==maxa ) \n"
1030  " { \n"
1031  " float4 TEMP = A1; A1 = A0; A0 = TEMP; \n"
1032  " }else if( x3.z == maxa ) \n"
1033  " { \n"
1034  " float4 TEMP = A2; A2 = A0; A0 = TEMP; \n"
1035  " } \n"
1036  " A0 /= A0.x; \n"
1037  " A1 -= A1.x * A0; \n"
1038  " A2 -= A2.x * A0; \n"
1039  " float2 x2 = fabs((float2)(A1.y, A2.y)); \n"
1040  " if( x2.y > x2.x ) \n"
1041  " { \n"
1042  " float4 TEMP = A2.yzwx; \n"
1043  " A2.yzw = A1.yzw; \n"
1044  " A1.yzw = TEMP.xyz; \n"
1045  " x2.x = x2.y; \n"
1046  " } \n"
1047  " if(x2.x >= 1e-10) { \n"
1048  " A1.yzw /= A1.y; \n"
1049  " A2.yzw -= A2.y * A1.yzw; \n"
1050  " if(fabs(A2.z) >= 1e-10) {\n"
1051  " offset.z = A2.w /A2.z; \n"
1052  " offset.y = A1.w - offset.z*A1.z; \n"
1053  " offset.x = A0.w - offset.z*A0.z - offset.y*A0.y; \n"
1054  " if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
1055  " || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
1056  " }\n"
1057  " }\n"
1058  " }\n"
1059  <<"\n"
1060  " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1061  " result = (float4)(keyv, offset.xyz);\n"
1062  " }}}}\n"
1063  " write_imagef(texK, coord0, result);\n "
1064  "}\n" <<'\0';
1065  }
1066  else
1067  {
1068  out << "\n"
1069  " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1070  " result = (float4)(keyv, 0, 0, 0);\n"
1071  " }}}}\n"
1072  " write_imagef(texK, coord0, result);\n "
1073  "}\n" <<'\0';
1074  }
1075 
1076  s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
1077 }
1078 
1079 void ProgramBagCL::LoadDisplayShaders()
1080 {
1081  //"uniform sampler2DRect tex; void main(){\n"
1082  //"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
1083  //"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
1084  s_unpack = new ProgramCL("main",
1085  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1086  " int width, int height) {\n"
1087  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1088  "int x = get_global_id(0), y = get_global_id(1); \n"
1089  "if(x >= width || y >= height) return;\n"
1090  "int xx = x / 2, yy = y / 2; \n"
1091  "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1092  "float v1 = (x & 1 ? vv.w : vv.z); \n"
1093  "float v2 = (x & 1 ? vv.y : vv.x);\n"
1094  "float v = y & 1 ? v1 : v2;\n"
1095  "float4 result = (float4) (v, v, v, 1);"
1096  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1097 
1098  s_unpack_dog = new ProgramCL("main",
1099  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1100  " int width, int height) {\n"
1101  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1102  "int x = get_global_id(0), y = get_global_id(1); \n"
1103  "if(x >= width || y >= height) return;\n"
1104  "int xx = x / 2, yy = y / 2; \n"
1105  "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1106  "float v1 = (x & 1 ? vv.w : vv.z); \n"
1107  "float v2 = (x & 1 ? vv.y : vv.x);\n"
1108  "float v0 = y & 1 ? v1 : v2;\n"
1109  "float v = 0.5 + 20.0 * v0;\n "
1110  "float4 result = (float4) (v, v, v, 1);"
1111  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1112 
1113  s_unpack_grd = new ProgramCL("main",
1114  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1115  " int width, int height) {\n"
1116  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1117  "int x = get_global_id(0), y = get_global_id(1); \n"
1118  "if(x >= width || y >= height) return;\n"
1119  "int xx = x / 2, yy = y / 2; \n"
1120  "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1121  "float v1 = (x & 1 ? vv.w : vv.z); \n"
1122  "float v2 = (x & 1 ? vv.y : vv.x);\n"
1123  "float v0 = y & 1 ? v1 : v2;\n"
1124  "float v = 5.0 * v0;\n "
1125  "float4 result = (float4) (v, v, v, 1);"
1126  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1127 
1128  s_unpack_key = new ProgramCL("main",
1129  "__kernel void main(__read_only image2d_t dog,\n"
1130  " __read_only image2d_t key,\n"
1131  " __write_only image2d_t output,\n"
1132  " int width, int height) {\n"
1133  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1134  "int x = get_global_id(0), y = get_global_id(1); \n"
1135  "if(x >= width || y >= height) return;\n"
1136  "int xx = x / 2, yy = y / 2; \n"
1137  "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
1138  "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
1139  "int k1 = (x & 1 ? cc.w : cc.z); \n"
1140  "int k2 = (x & 1 ? cc.y : cc.x);\n"
1141  "int k0 = y & 1 ? k1 : k2;\n"
1142  "float4 result;\n"
1143  "if(k0 != 0){\n"
1144  " //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
1145  " result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0, 0.0, 1.0)); \n"
1146  "}else{"
1147  "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
1148  "float v1 = (x & 1 ? vv.w : vv.z); \n"
1149  "float v2 = (x & 1 ? vv.y : vv.x);\n"
1150  "float v0 = y & 1 ? v1 : v2;\n"
1151  "float v = 0.5 + 20.0 * v0;\n "
1152  "result = (float4) (v, v, v, 1);"
1153  "}\n"
1154  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1155 }
1156 
1157 
1158 void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
1159 {
1160 
1161 }
1162 
1163 void ProgramBagCL::SetGradPassParam(int texP)
1164 {
1165 
1166 }
1167 
1168 void ProgramBagCL::SetGenListEndParam(int ktex)
1169 {
1170 
1171 }
1172 
1173 void ProgramBagCL::SetDogTexParam(int texU, int texD)
1174 {
1175 
1176 }
1177 
1178 void ProgramBagCL::SetGenListInitParam(int w, int h)
1179 {
1180  float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f, (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
1181 
1182 }
1183 
1184 
1185 void ProgramBagCL::SetGenListStartParam(float width, int tex0)
1186 {
1187 
1188 }
1189 
1190 
1191 
1192 void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
1193 {
1194 
1195 }
1196 
1197 void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
1198 {
1199 
1200 }
1201 
1202 void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
1203 {
1204 
1205 }
1206 
1207 
1208 void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
1209 {
1210 
1211 
1212 }
1213 
1214 void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth, float width, float height, float sigma)
1215 {
1216 
1217 }
1218 
1219 
1220 
1221 const char* ProgramBagCL::GetErrorString(cl_int error)
1222 {
1223  static const char* errorString[] = {
1224  "CL_SUCCESS",
1225  "CL_DEVICE_NOT_FOUND",
1226  "CL_DEVICE_NOT_AVAILABLE",
1227  "CL_COMPILER_NOT_AVAILABLE",
1228  "CL_MEM_OBJECT_ALLOCATION_FAILURE",
1229  "CL_OUT_OF_RESOURCES",
1230  "CL_OUT_OF_HOST_MEMORY",
1231  "CL_PROFILING_INFO_NOT_AVAILABLE",
1232  "CL_MEM_COPY_OVERLAP",
1233  "CL_IMAGE_FORMAT_MISMATCH",
1234  "CL_IMAGE_FORMAT_NOT_SUPPORTED",
1235  "CL_BUILD_PROGRAM_FAILURE",
1236  "CL_MAP_FAILURE",
1237  "",
1238  "",
1239  "",
1240  "",
1241  "",
1242  "",
1243  "",
1244  "",
1245  "",
1246  "",
1247  "",
1248  "",
1249  "",
1250  "",
1251  "",
1252  "",
1253  "",
1254  "CL_INVALID_VALUE",
1255  "CL_INVALID_DEVICE_TYPE",
1256  "CL_INVALID_PLATFORM",
1257  "CL_INVALID_DEVICE",
1258  "CL_INVALID_CONTEXT",
1259  "CL_INVALID_QUEUE_PROPERTIES",
1260  "CL_INVALID_COMMAND_QUEUE",
1261  "CL_INVALID_HOST_PTR",
1262  "CL_INVALID_MEM_OBJECT",
1263  "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
1264  "CL_INVALID_IMAGE_SIZE",
1265  "CL_INVALID_SAMPLER",
1266  "CL_INVALID_BINARY",
1267  "CL_INVALID_BUILD_OPTIONS",
1268  "CL_INVALID_PROGRAM",
1269  "CL_INVALID_PROGRAM_EXECUTABLE",
1270  "CL_INVALID_KERNEL_NAME",
1271  "CL_INVALID_KERNEL_DEFINITION",
1272  "CL_INVALID_KERNEL",
1273  "CL_INVALID_ARG_INDEX",
1274  "CL_INVALID_ARG_VALUE",
1275  "CL_INVALID_ARG_SIZE",
1276  "CL_INVALID_KERNEL_ARGS",
1277  "CL_INVALID_WORK_DIMENSION",
1278  "CL_INVALID_WORK_GROUP_SIZE",
1279  "CL_INVALID_WORK_ITEM_SIZE",
1280  "CL_INVALID_GLOBAL_OFFSET",
1281  "CL_INVALID_EVENT_WAIT_LIST",
1282  "CL_INVALID_EVENT",
1283  "CL_INVALID_OPERATION",
1284  "CL_INVALID_GL_OBJECT",
1285  "CL_INVALID_BUFFER_SIZE",
1286  "CL_INVALID_MIP_LEVEL",
1287  "CL_INVALID_GLOBAL_WORK_SIZE",
1288  };
1289 
1290  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
1291 
1292  const int index = -error;
1293 
1294  return (index >= 0 && index < errorCount) ? errorString[index] : "";
1295 }
1296 
1297 bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
1298 {
1299  if(error == CL_SUCCESS) return true;
1300  const char *errstr = GetErrorString(error);
1301  if(errstr && errstr[0]) std::cerr << errstr;
1302  else std::cerr << "Error " << error;
1303  if(location) std::cerr << " at " << location;
1304  std::cerr << "\n";
1305  exit(0);
1306  return false;
1307 
1308 }
1309 
1310 
1313 
1314 void ProgramBagCLN::LoadFixedShaders()
1315 {
1316  s_sampling = new ProgramCL("sampling",
1317  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1318  "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output, "
1319  " int width, int height) {\n"
1320  "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1321  "if( coord.x >= width || coord.y >= height) return;\n"
1322  "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }" , _context, _device);
1323 
1324  s_sampling_k = new ProgramCL("sampling_k",
1325  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1326  "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
1327  " int width, int height, int step) {\n"
1328  "int x = get_global_id(0), y = get_global_id(1); \n"
1329  "if( x >= width || y >= height) return;\n"
1330  "int xa = x * step, ya = y *step; \n"
1331  "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
1332  "write_imagef(output, (int2) (x, y), v1); }" , _context, _device);
1333 
1334 
1335  s_sampling_u = new ProgramCL("sampling_u",
1336  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
1337  "__kernel void sampling_u(__read_only image2d_t input, \n"
1338  " __write_only image2d_t output,\n"
1339  " int width, int height, float step) {\n"
1340  "int x = get_global_id(0), y = get_global_id(1); \n"
1341  "if( x >= width || y >= height) return;\n"
1342  "float xa = x * step, ya = y *step; \n"
1343  "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
1344  "write_imagef(output, (int2) (x, y), (float4)(v1)); }" , _context, _device);
1345 
1346  s_dog_pass = new ProgramCL("dog_pass",
1347  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1348  "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1349  " __write_only image2d_t dog, int width, int height) {\n"
1350  "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1351  "if( coord.x >= width || coord.y >= height) return;\n"
1352  "float cc = read_imagef(tex , sampler, coord).x; \n"
1353  "float cp = read_imagef(texp, sampler, coord).x;\n"
1354  "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device);
1355 
1356  s_grad_pass = new ProgramCL("grad_pass",
1357  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1358  "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1359  " __write_only image2d_t dog, int width, int height, \n"
1360  " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
1361  "int x = get_global_id(0), y = get_global_id(1); \n"
1362  "if( x >= width || y >= height) return;\n"
1363  "int2 coord = (int2) (x, y);\n"
1364  "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
1365  "float cc = read_imagef(tex , sampler, coord).x; \n"
1366  "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
1367  "float cp = read_imagef(texp, sampler, coord).x;\n"
1368  "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1369  "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
1370  "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
1371  "float dx = cr - cl, dy = cu - cd; \n"
1372  "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
1373  "write_imagef(grad, coord, (float4)(gg));\n"
1374  "float oo = atan2(dy, dx + FLT_MIN);\n"
1375  "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device);
1376 
1377  s_grad_pass2 = new ProgramCL("grad_pass2",
1378  "#define BLOCK_DIMX 32\n"
1379  "#define BLOCK_DIMY 14\n"
1380  "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
1381  "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1382  " __write_only image2d_t dog, int width, int height,\n"
1383  " __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n"
1384  "__local float block[BLOCK_SIZE]; \n"
1385  "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1386  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1387  "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
1388  " get_global_id(1) - get_group_id(1) * 2 - 1); \n"
1389  "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
1390  "float cc = read_imagef(tex, sampler, coord).x;\n"
1391  "block[idx] = cc;\n"
1392  "barrier(CLK_LOCAL_MEM_FENCE);\n"
1393  "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
1394  "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
1395  "if( coord.x >= width) return; \n"
1396  "if( coord.y >= height) return;\n"
1397  "float cp = read_imagef(texp, sampler, coord).x;\n"
1398  "float dx = block[idx + 1] - block[idx - 1];\n"
1399  "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
1400  "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1401  "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
1402  "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device);
1403 }
1404 
1405 void ProgramBagCLN::LoadDisplayShaders()
1406 {
1407  s_unpack = new ProgramCL("main",
1408  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1409  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1410  " int width, int height) {\n"
1411  "int x = get_global_id(0), y = get_global_id(1); \n"
1412  "if(x >= width || y >= height) return;\n"
1413  "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1414  "float4 result = (float4) (v, v, v, 1);"
1415  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1416 
1417  s_unpack_grd = new ProgramCL("main",
1418  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1419  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1420  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1421  " int width, int height) {\n"
1422  "int x = get_global_id(0), y = get_global_id(1); \n"
1423  "if(x >= width || y >= height) return;\n"
1424  "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1425  "float v = 5.0 * v0; float4 result = (float4) (v, v, v, 1);"
1426  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1427 
1428  s_unpack_dog = new ProgramCL("main",
1429  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1430  "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1431  " int width, int height) {\n"
1432  "int x = get_global_id(0), y = get_global_id(1); \n"
1433  "if(x >= width || y >= height) return;\n"
1434  "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1435  "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
1436  "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1437 }
1438 
1439 ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
1440 {
1442  char buffer[10240];
1443  ostrstream out(buffer, 10240);
1444  out << "#define KERNEL_WIDTH " << width << "\n"
1445  << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1446  "#define BLOCK_WIDTH 128\n"
1447  "#define BLOCK_HEIGHT 1\n"
1448  "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1449  "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1450  "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1451  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1452  "__kernel void filter_h(__read_only image2d_t input, \n"
1453  " __write_only image2d_t output, int width_, int height_, \n"
1454  " __constant float* weight) {\n"
1455  "__local float data[CACHE_WIDTH]; \n"
1456  "int x = get_global_id(0), y = get_global_id(1);\n"
1457  "#pragma unroll\n"
1458  "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1459  "{\n"
1460  " if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1461  " {\n"
1462  " int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
1463  " data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
1464  " }\n"
1465  "}\n"
1466  "barrier(CLK_LOCAL_MEM_FENCE); \n"
1467  "if( x > width_ || y > height_) return; \n"
1468  "float result = 0; \n"
1469  "#pragma unroll\n"
1470  "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1471  "{\n"
1472  " result += data[get_local_id(0) + i] * weight[i];\n"
1473  "}\n"
1474  << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1475  return new ProgramCL("filter_h", buffer, _context, _device);
1476 }
1477 
1478 
1479 
1480 ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
1481 {
1483  char buffer[10240];
1484  ostrstream out(buffer, 10240);
1485  out << "#define KERNEL_WIDTH " << width << "\n"
1486  << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1487  "#define BLOCK_WIDTH 128\n"
1488  "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1489  "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1490  "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1491  "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
1492  " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1493  "__kernel void filter_v(__read_only image2d_t input, \n"
1494  " __write_only image2d_t output, int width_, int height_, \n"
1495  " __constant float* weight) {\n"
1496  "__local float data[CACHE_WIDTH]; \n"
1497  "int x = get_global_id(0), y = get_global_id(1);\n"
1498  "#pragma unroll\n"
1499  "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1500  "{\n"
1501  " if(get_local_id(1) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1502  " {\n"
1503  " int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
1504  " data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
1505  " }\n"
1506  "}\n"
1507  "barrier(CLK_LOCAL_MEM_FENCE); \n"
1508  "if( x > width_ || y > height_) return; \n"
1509  "float result = 0; \n"
1510  "#pragma unroll\n"
1511  "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1512  "{\n"
1513  " result += data[get_local_id(1) + i] * weight[i];\n"
1514  "}\n"
1515  << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1516 
1517  return new ProgramCL("filter_v", buffer, _context, _device);
1518 }
1519 
1520 FilterCL* ProgramBagCLN::CreateFilter(float kernel[], int width)
1521 {
1522  FilterCL * filter = new FilterCL;
1523  filter->s_shader_h = CreateFilterH(kernel, width);
1524  filter->s_shader_v = CreateFilterV(kernel, width);
1525  filter->_weight = new CLTexImage(_context, _queue);
1526  filter->_weight->InitBufferTex(width, 1, 1);
1527  filter->_weight->CopyFromHost(kernel);
1528  filter->_size = width;
1529  return filter;
1530 }
1531 
1532 
1533 void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
1534 {
1535  cl_kernel kernelh = filter->s_shader_h->_kernel;
1536  cl_kernel kernelv = filter->s_shader_v->_kernel;
1538 
1539  cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
1540  cl_mem weight = (cl_mem) filter->_weight->_clData;
1541  cl_int w_ = w - 1, h_ = h - 1;
1542 
1543 
1544  clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
1545  clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
1546  clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
1547  clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
1548  clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
1549 
1550  size_t dim00 = 128, dim01 = 1;
1551  size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
1552  status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
1553  CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
1554  if(status != CL_SUCCESS) return;
1555 
1556 
1557  clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
1558  clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
1559  clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
1560  clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
1561  clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight);
1562 
1563  size_t dim10 = 1, dim11 = 128;
1564  size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
1565  status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
1566  CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
1567  //clReleaseEvent(event);
1568 }
1569 
1570 void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
1571 {
1572  cl_kernel kernel;
1573  cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
1574 
1575  cl_int fullstep = (1 << log_scale);
1576  kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
1577  clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
1578  clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
1579  clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
1580  clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
1581  if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
1582 
1583  size_t dim0 = 128, dim1 = 1;
1584  //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
1585  size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
1586  cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
1587  CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
1588 }
1589 
1590 
1591 #endif
1592 
int width
int size
std::string name
int height
int offset
#define NULL
static float _FilterWidthFactor
Definition: GlobalUtil.h:56
static int _UseSiftGPUEX
Definition: GlobalUtil.h:76
static int _SubpixelLocalization
Definition: GlobalUtil.h:72
static int _octave_min_default
Definition: GlobalUtil.h:78
static int _PreciseBorder
Definition: GlobalUtil.h:75
static int _timingL
Definition: GlobalUtil.h:47
static int _verbose
Definition: GlobalUtil.h:44
static int _MaxFilterWidth
Definition: GlobalUtil.h:55
static int _debug
Definition: GlobalUtil.h:54
static int _DescriptorPPT
Definition: GlobalUtil.h:69
static void StopTimer()
Definition: GlobalUtil.h:127
static void StartTimer(const char *event)
Definition: GlobalUtil.h:128
float GetInitialSmoothSigma(int octave_min)
Definition: SiftGPU.cpp:415
float * _sigma
Definition: SiftGPU.h:75
int _dog_level_num
Definition: SiftGPU.h:85
float _sigma_skip1
Definition: SiftGPU.h:77
int _sigma_num
Definition: SiftGPU.h:82
float _sigma_skip0
Definition: SiftGPU.h:76
ImGuiContext * context
Definition: Window.cpp:76
static void error(char *msg)
Definition: lsd.c:159
QTextStream & endl(QTextStream &stream)
Definition: QtCompat.h:718
MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:89
Definition: Eigen.h:85