kdtree_cuda_3d_index.cu
/**********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

#include "kdtree_cuda_3d_index.h"
#include <FLANN/algorithms/dist.h>
#include <FLANN/util/cuda/result_set.h>
// #define THRUST_DEBUG 1
#include <cuda.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <vector_types.h>
#include <FLANN/util/cutil_math.h>
#include <thrust/host_vector.h>
#include <FLANN/util/cuda/heap.h>
#include <thrust/scan.h>
#include <thrust/count.h>
#include <FLANN/algorithms/kdtree_cuda_builder.h>

namespace flann
{

namespace KdTreeCudaPrivate
{
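//! Stackless kd-tree traversal for a single query point: descends to the leaf
//! containing q, then backtracks via the parent pointers, visiting a far
//! subtree only if its AABB could still contain a point closer than the
//! current worst result.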
template< typename GPUResultSet, typename Distance >
__device__
void searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo* splits,
                     const int* child1,
                     const int* parent,
                     const float4* aabbLow,
                     const float4* aabbHigh, const float4* elements, const float4& q, GPUResultSet& result, const Distance& distance = Distance() )
{
    bool backtrack=false;
    int lastNode=-1;
    int current=0;

    cuda::kd_tree_builder_detail::SplitInfo split;
    while(true) {
        if( current==-1 ) break;
        split = splits[current];

        float diff1;
        if( split.split_dim==0 ) diff1=q.x- split.split_val;
        else if( split.split_dim==1 ) diff1=q.y- split.split_val;
        else if( split.split_dim==2 ) diff1=q.z- split.split_val;

        // children are stored next to each other: leftChild+1 == rightChild
        int leftChild= child1[current];
        int bestChild=leftChild;
        int otherChild=leftChild;

        if( diff1<0 ) {
            otherChild++;
        }
        else {
            bestChild++;
        }

        if( !backtrack ) {
            /* If this is a leaf node, then do check and return. */
            if (leftChild==-1) {
                for (int i=split.left; i<split.right; ++i) {
                    float dist=distance.dist(elements[i],q);
                    result.insert(i,dist);
                }
                backtrack=true;
                lastNode=current;
                current=parent[current];
            }
            else { // go to closer child node
                lastNode=current;
                current=bestChild;
            }
        }
        else { // continue moving back up the tree or visit far node?
            // minimum possible distance between query point and a point inside the AABB
            float mindistsq=0;
            float4 aabbMin=aabbLow[otherChild];
            float4 aabbMax=aabbHigh[otherChild];

            if( q.x < aabbMin.x ) mindistsq+=distance.axisDist(q.x, aabbMin.x);
            else if( q.x > aabbMax.x ) mindistsq+=distance.axisDist(q.x, aabbMax.x);
            if( q.y < aabbMin.y ) mindistsq+=distance.axisDist(q.y, aabbMin.y);
            else if( q.y > aabbMax.y ) mindistsq+=distance.axisDist(q.y, aabbMax.y);
            if( q.z < aabbMin.z ) mindistsq+=distance.axisDist(q.z, aabbMin.z);
            else if( q.z > aabbMax.z ) mindistsq+=distance.axisDist(q.z, aabbMax.z);

            // the far node was NOT the last node (== not visited yet) AND there could be a closer point in it
            if(( lastNode==bestChild) && (mindistsq <= result.worstDist() ) ) {
                lastNode=current;
                current=otherChild;
                backtrack=false;
            }
            else {
                lastNode=current;
                current=parent[current];
            }
        }
    }
}

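//! One thread per query point: thread tid loads query point number tid, runs
//! the traversal above and writes its results to row tid of the output arrays.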
template< typename GPUResultSet, typename Distance >
__global__
void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits,
                   const int* child1,
                   const int* parent,
                   const float4* aabbMin,
                   const float4* aabbMax, const float4* elements, const float* query, int stride, int resultStride, int* resultIndex, float* resultDist, int querysize, GPUResultSet result, Distance dist = Distance())
{
    typedef float DistanceType;
    typedef float ElementType;
    size_t tid = blockDim.x*blockIdx.x + threadIdx.x;

    if( tid >= querysize ) return;

    float4 q = make_float4(query[tid*stride],query[tid*stride+1],query[tid*stride+2],0);

    result.setResultLocation( resultDist, resultIndex, tid, resultStride );

    searchNeighbors(splits,child1,parent,aabbMin,aabbMax,elements, q, result, dist);

    result.finish();
}

} // namespace KdTreeCudaPrivate

//! Contains pointers to GPU buffers that use CUDA data types and therefore
//! cannot easily be forward-declared; essentially, it owns all GPU buffers
//! of the index.
template<typename Distance>
struct KDTreeCuda3dIndex<Distance>::GpuHelper
{
    thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_;
    thrust::device_vector< int >* gpu_parent_;
    thrust::device_vector< int >* gpu_child1_;
    thrust::device_vector< float4 >* gpu_aabb_min_;
    thrust::device_vector< float4 >* gpu_aabb_max_;
    thrust::device_vector<float4>* gpu_points_;
    thrust::device_vector<int>* gpu_vind_;

    GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0) {
    }
    ~GpuHelper()
    {
        delete gpu_splits_;
        gpu_splits_=0;
        delete gpu_parent_;
        gpu_parent_=0;
        delete gpu_child1_;
        gpu_child1_=0;
        delete gpu_aabb_max_;
        gpu_aabb_max_=0;
        delete gpu_aabb_min_;
        gpu_aabb_min_=0;
        delete gpu_vind_;
        gpu_vind_=0;

        delete gpu_points_;
        gpu_points_=0;
    }
};

//! Thrust transform functor: maps indices in the internal (reordered) data set
//! back to the original dataset indices. Negative values ("no neighbor found")
//! are passed through unchanged. Returning int avoids a lossy round-trip
//! through float for large indices.
struct map_indices
{
    const int* v_;

    map_indices(const int* v) : v_(v) {
    }

    __host__ __device__
    int operator() (const int& i) const
    {
        if( i >= 0 ) return v_[i];
        else return i;
    }
};

//! Implementation of the L2 distance for the CUDA kernels.
//! Note: like flann::L2, this returns the *squared* euclidean distance.
struct CudaL2
{
    static float
    __host__ __device__
    axisDist( float a, float b )
    {
        return (a-b)*(a-b);
    }

    static float
    __host__ __device__
    dist( float4 a, float4 b )
    {
        float4 diff = a-b;
        return dot(diff,diff);
    }
};
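
// Example: CudaL2::dist((0,0,0,0), (1,2,2,0)) = 1 + 4 + 4 = 9, the square of
// the euclidean distance 3.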

//! Implementation of the L1 distance for the CUDA kernels.
//! NOT TESTED!
struct CudaL1
{
    static float
    __host__ __device__
    axisDist( float a, float b )
    {
        return fabs(a-b);
    }

    static float
    __host__ __device__
    dist( float4 a, float4 b )
    {
        return fabs(a.x-b.x)+fabs(a.y-b.y)+fabs(a.z-b.z)+fabs(a.w-b.w);
    }
};

//! Used to adapt CPU and GPU distance types:
//! specializations define ::type as the corresponding GPU distance type.
//! \see GpuDistance< L2<float> >, GpuDistance< L2_Simple<float> >
template< class Distance >
struct GpuDistance
{
};

template<>
struct GpuDistance< L2<float> >
{
    typedef CudaL2 type;
};

template<>
struct GpuDistance< L2_Simple<float> >
{
    typedef CudaL2 type;
};

template<>
struct GpuDistance< L1<float> >
{
    typedef CudaL1 type;
};
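
// A minimal sketch (not part of the original file) of how a further mapping
// could be added, assuming a hypothetical CPU distance MyDist<float> and a
// matching GPU functor CudaMyDist with static axisDist()/dist() members like
// the ones above:
//
//   template<>
//   struct GpuDistance< MyDist<float> >
//   {
//       typedef CudaMyDist type;
//   };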

template< typename Distance >
void KDTreeCuda3dIndex<Distance>::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const
{
    assert(indices.rows >= queries.rows);
    assert(dists.rows >= queries.rows);
    assert(indices.cols >= knn);
    assert( dists.cols == indices.cols && dists.stride==indices.stride );

    // matrix strides are given in bytes; convert them to element counts
    int istride=queries.stride/sizeof(ElementType);
    int ostride=indices.stride/sizeof(int);

    bool matrices_on_gpu = params.matrices_in_gpu_ram;

    int threadsPerBlock = 128;
    int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;

    float epsError = 1+params.eps;
    bool sorted = params.sorted;
    bool use_heap = params.use_heap;

    typename GpuDistance<Distance>::type distance;

    if( !matrices_on_gpu ) {
        // copy the queries to the GPU and allocate result buffers there
        thrust::device_vector<float> queriesDev(istride* queries.rows,0);
        thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
        thrust::device_vector<float> distsDev(queries.rows* ostride);
        thrust::device_vector<int> indicesDev(queries.rows* ostride);

        if( knn==1 ) {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                ostride,
                thrust::raw_pointer_cast(&indicesDev[0]),
                thrust::raw_pointer_cast(&distsDev[0]),
                queries.rows, flann::cuda::SingleResultSet<float>(epsError), distance);
        }
        else {
            if( use_heap ) {
                KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                    thrust::raw_pointer_cast(&queriesDev[0]),
                    istride,
                    ostride,
                    thrust::raw_pointer_cast(&indicesDev[0]),
                    thrust::raw_pointer_cast(&distsDev[0]),
                    queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError), distance);
            }
            else {
                KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                    thrust::raw_pointer_cast(&queriesDev[0]),
                    istride,
                    ostride,
                    thrust::raw_pointer_cast(&indicesDev[0]),
                    thrust::raw_pointer_cast(&distsDev[0]),
                    queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError), distance);
            }
        }
        thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
        // map the indices back from tree order to the original dataset order
        thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_vind_)[0]))));
        thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
    }
    else {
        // the matrices already live in GPU memory; search in-place
        thrust::device_ptr<float> qd = thrust::device_pointer_cast(queries.ptr());
        thrust::device_ptr<float> dd = thrust::device_pointer_cast(dists.ptr());
        thrust::device_ptr<int> id = thrust::device_pointer_cast(indices.ptr());

        if( knn==1 ) {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                qd.get(),
                istride,
                ostride,
                id.get(),
                dd.get(),
                queries.rows, flann::cuda::SingleResultSet<float>(epsError), distance);
        }
        else {
            if( use_heap ) {
                KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                    qd.get(),
                    istride,
                    ostride,
                    id.get(),
                    dd.get(),
                    queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError), distance);
            }
            else {
                KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                    thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                    qd.get(),
                    istride,
                    ostride,
                    id.get(),
                    dd.get(),
                    queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError), distance);
            }
        }
        thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_vind_)[0]))));
    }
}
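
/* A minimal usage sketch (not part of the original file): this method is
 * normally reached through the public flann::Index interface. Assuming a
 * dataset of n 3D points and nq query points, something along these lines
 * would exercise the GPU search path:
 *
 *   flann::Matrix<float> data(data_buf, n, 3);
 *   flann::Index<flann::L2<float> > index(data, flann::KDTreeCuda3dIndexParams());
 *   index.buildIndex();   // builds the kd-tree and uploads it to the GPU
 *
 *   flann::Matrix<float> queries(query_buf, nq, 3);
 *   flann::Matrix<int>   indices(idx_buf, nq, knn);
 *   flann::Matrix<float> dists(dist_buf, nq, knn);
 *   index.knnSearch(queries, indices, dists, knn, flann::SearchParams());
 */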

template< typename Distance>
int KDTreeCuda3dIndex<Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
                                                  std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const
{
    int max_neighbors = params.max_neighbors;
    bool sorted = params.sorted;
    bool use_heap = params.use_heap;
    if (indices.size() < queries.rows ) indices.resize(queries.rows);
    if (dists.size() < queries.rows ) dists.resize(queries.rows);

    int istride=queries.stride/sizeof(ElementType);

    thrust::device_vector<float> queriesDev(istride* queries.rows,0);
    thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
    thrust::device_vector<int> countsDev(queries.rows);

    typename GpuDistance<Distance>::type distance;

    int threadsPerBlock = 128;
    int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;

    // first pass: only count the neighbors of each query point
    KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
        thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
        thrust::raw_pointer_cast(&queriesDev[0]),
        istride,
        1,
        thrust::raw_pointer_cast(&countsDev[0]),
        0,
        queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,max_neighbors),
        distance);

    thrust::host_vector<int> counts_host=countsDev;

    if( max_neighbors!=0 ) { // the counts are needed later, but the exclusive_scan below overwrites the array
        for( size_t i=0; i<queries.rows; i++ ) {
            int count = counts_host[i];
            if( count > 0 ) {
                indices[i].resize(count);
                dists[i].resize(count);
            }
            else {
                indices[i].clear();
                dists[i].clear();
            }
        }
    }

    int neighbors_last_elem = countsDev.back();
    thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() );

    size_t total_neighbors=neighbors_last_elem+countsDev.back();
    if( max_neighbors==0 ) return total_neighbors;

    thrust::device_vector<int> indicesDev(total_neighbors,-1);
    thrust::device_vector<float> distsDev(total_neighbors,std::numeric_limits<float>::infinity());

    // second pass: write the actual neighbors, using the scanned counts as per-query output offsets
    if( max_neighbors<0 ) {
        KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
            thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
            thrust::raw_pointer_cast(&queriesDev[0]),
            istride,
            1,
            thrust::raw_pointer_cast(&indicesDev[0]),
            thrust::raw_pointer_cast(&distsDev[0]),
            queries.rows, flann::cuda::RadiusResultSet<float>(radius,thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
    }
    else {
        if( use_heap ) {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                1,
                thrust::raw_pointer_cast(&indicesDev[0]),
                thrust::raw_pointer_cast(&distsDev[0]),
                queries.rows, flann::cuda::RadiusKnnResultSet<float, true>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
        }
        else {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                1,
                thrust::raw_pointer_cast(&indicesDev[0]),
                thrust::raw_pointer_cast(&distsDev[0]),
                queries.rows, flann::cuda::RadiusKnnResultSet<float, false>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
        }
    }
    // map the indices back from tree order to the original dataset order
    thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_vind_)[0]))));
    thrust::host_vector<int> indices_temp = indicesDev;
    thrust::host_vector<float> dists_temp = distsDev;

    int buffer_index=0;
    for( size_t i=0; i<queries.rows; i++ ) {
        for( int j=0; j<counts_host[i]; j++ ) {
            dists[i][j]=dists_temp[buffer_index];
            indices[i][j]=indices_temp[buffer_index];
            ++buffer_index;
        }
    }

    return buffer_index;
}
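
/* Note on the two-pass scheme above: the counting pass plus exclusive scan
 * turns per-query neighbor counts into start offsets into one shared output
 * buffer. For example, counts [2, 0, 3] scan to offsets [0, 2, 2], and the
 * required buffer size is the last count plus the last offset: 3 + 2 = 5.
 */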

//! Used in the radius search to count the total number of neighbors.
struct isNotMinusOne
{
    __host__ __device__
    bool operator() ( int i ) {
        return i!=-1;
    }
};

template< typename Distance>
int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const
{
    int max_neighbors = params.max_neighbors;
    assert(indices.rows >= queries.rows);
    assert(dists.rows >= queries.rows || max_neighbors==0 );
    assert(indices.stride==dists.stride || max_neighbors==0 );
    assert( indices.cols==indices.stride/sizeof(int) );

    bool sorted = params.sorted;
    bool matrices_on_gpu = params.matrices_in_gpu_ram;
    float epsError = 1+params.eps;
    bool use_heap = params.use_heap;
    int istride=queries.stride/sizeof(ElementType);
    int ostride=indices.stride/sizeof(int);

    if( max_neighbors<0 ) max_neighbors=indices.cols;

    if( !matrices_on_gpu ) {
        thrust::device_vector<float> queriesDev(istride* queries.rows,0);
        thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
        typename GpuDistance<Distance>::type distance;
        int threadsPerBlock = 128;
        int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
        if( max_neighbors== 0 ) {
            // count-only mode: write the per-query neighbor counts into the indices matrix
            thrust::device_vector<int> indicesDev(queries.rows* ostride);
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                ostride,
                thrust::raw_pointer_cast(&indicesDev[0]),
                0,
                queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
                distance);
            thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
            return thrust::reduce(indicesDev.begin(), indicesDev.end() );
        }

        thrust::device_vector<float> distsDev(queries.rows* max_neighbors);
        thrust::device_vector<int> indicesDev(queries.rows* max_neighbors);

        if( use_heap ) {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                ostride,
                thrust::raw_pointer_cast(&indicesDev[0]),
                thrust::raw_pointer_cast(&distsDev[0]),
                queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
        }
        else {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                thrust::raw_pointer_cast(&queriesDev[0]),
                istride,
                ostride,
                thrust::raw_pointer_cast(&indicesDev[0]),
                thrust::raw_pointer_cast(&distsDev[0]),
                queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
        }

        thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
        thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_vind_)[0]))));
        thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );

        return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() );
    }
    else {
        thrust::device_ptr<float> qd=thrust::device_pointer_cast(queries.ptr());
        thrust::device_ptr<float> dd=thrust::device_pointer_cast(dists.ptr());
        thrust::device_ptr<int> id=thrust::device_pointer_cast(indices.ptr());
        typename GpuDistance<Distance>::type distance;
        int threadsPerBlock = 128;
        int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;

        if( max_neighbors== 0 ) {
            // count-only mode: accumulate the counts in a zero-initialized temporary
            // buffer (mirroring the host-matrix branch above), then copy them into
            // the caller's indices matrix
            thrust::device_vector<int> indicesDev(queries.rows* ostride);
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                qd.get(),
                istride,
                ostride,
                thrust::raw_pointer_cast(&indicesDev[0]),
                0,
                queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
                distance);
            thrust::copy( indicesDev.begin(), indicesDev.end(), id );
            return thrust::reduce(indicesDev.begin(), indicesDev.end() );
        }

        if( use_heap ) {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                qd.get(),
                istride,
                ostride,
                id.get(),
                dd.get(),
                queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
        }
        else {
            KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>>(
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
                thrust::raw_pointer_cast(&((*gpu_helper_->gpu_points_)[0])),
                qd.get(),
                istride,
                ostride,
                id.get(),
                dd.get(),
                queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
        }

        thrust::transform(id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast(&((*gpu_helper_->gpu_vind_)[0]))));

        return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() );
    }
}
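
/* Usage note (not part of the original file): calling this overload with
 * SearchParams::max_neighbors == 0 performs only the counting pass; the
 * per-query neighbor counts are written into the indices matrix and their
 * sum is returned, which makes it possible to size the result buffers
 * before a second, full radius search.
 */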

template<typename Distance>
void KDTreeCuda3dIndex<Distance>::uploadTreeToGpu()
{
    // just make sure that no weird alignment stuff is going on...
    // it shouldn't, but who knows
    // (this would be a static assertion, but so far flann seems to avoid boost)
    // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) );
    delete gpu_helper_;
    gpu_helper_ = new GpuHelper;
    gpu_helper_->gpu_points_=new thrust::device_vector<float4>(size_);
    thrust::device_vector<float4> tmp(size_);
    if( get_param(index_params_,"input_is_gpu_float4",false) ) {
        assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float));
        thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin());
    }
    else {
        // k is limited to 4 -> use 128 bit alignment regardless of dimensionality
        // makes the cpu search about 5% slower, but the gpu can read a float4 with a single instruction
        // (vs a float2 load and a float load for a float3 value)
        // pad the data directly here to avoid having to copy and re-format it
        // when uploading it to the GPU
        data_ = flann::Matrix<ElementType>(new ElementType[size_*4], size_, dim_,4*4);
        for (size_t i=0; i<size_; ++i) {
            for (size_t j=0; j<dim_; ++j) {
                data_[i][j] = dataset_[i][j];
            }
            for (size_t j=dim_; j<4; ++j) {
                data_[i][j] = 0;
            }
        }
        thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin());
    }

    CudaKdTreeBuilder builder( tmp, leaf_max_size_ );
    builder.buildTree();

    gpu_helper_->gpu_splits_ = builder.splits_;
    gpu_helper_->gpu_aabb_min_ = builder.aabb_min_;
    gpu_helper_->gpu_aabb_max_ = builder.aabb_max_;
    gpu_helper_->gpu_child1_ = builder.child1_;
    gpu_helper_->gpu_parent_=builder.parent_;
    gpu_helper_->gpu_vind_=builder.index_x_;
    // reorder the points into tree order so that leaf nodes reference contiguous ranges
    thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin());
}

template<typename Distance>
void KDTreeCuda3dIndex<Distance>::clearGpuBuffers()
{
    delete gpu_helper_;
    gpu_helper_=0;
}

// explicit instantiations for the L2<float> distance
template
void KDTreeCuda3dIndex<flann::L2<float> >::uploadTreeToGpu();

template
void KDTreeCuda3dIndex<flann::L2<float> >::clearGpuBuffers();

template
struct KDTreeCuda3dIndex<flann::L2<float> >::GpuHelper;

template
void KDTreeCuda3dIndex<flann::L2<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
                                                           std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;

// explicit instantiations for the L2_Simple<float> distance
template
void KDTreeCuda3dIndex<flann::L2_Simple<float> >::uploadTreeToGpu();

template
void KDTreeCuda3dIndex<flann::L2_Simple<float> >::clearGpuBuffers();

template
struct KDTreeCuda3dIndex<flann::L2_Simple<float> >::GpuHelper;

template
void KDTreeCuda3dIndex<flann::L2_Simple<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
                                                                  std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;

// explicit instantiations for the L1<float> distance
template
void KDTreeCuda3dIndex<flann::L1<float> >::uploadTreeToGpu();

template
void KDTreeCuda3dIndex<flann::L1<float> >::clearGpuBuffers();

template
struct KDTreeCuda3dIndex<flann::L1<float> >::GpuHelper;

template
void KDTreeCuda3dIndex<flann::L1<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;

template
int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
                                                           std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;

} // namespace flann