1 /**********************************************************************
2 * Software License Agreement (BSD License)
4 * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *************************************************************************/
30 #include "kdtree_cuda_3d_index.h"
31 #include <FLANN/algorithms/dist.h>
32 #include <FLANN/util/cuda/result_set.h>
33 // #define THRUST_DEBUG 1
35 #include <thrust/copy.h>
36 #include <thrust/device_vector.h>
37 #include <vector_types.h>
38 #include <FLANN/util/cutil_math.h>
39 #include <thrust/host_vector.h>
40 #include <thrust/copy.h>
41 #include <FLANN/util/cuda/heap.h>
42 #include <thrust/scan.h>
43 #include <thrust/count.h>
44 #include <FLANN/algorithms/kdtree_cuda_builder.h>
45 #include <vector_types.h>
49 namespace KdTreeCudaPrivate
51 template< typename GPUResultSet, typename Distance >
// Traverses the kd-tree for a single query point q, feeding candidate
// neighbors into `result`. Iterative descent with backtracking via parent
// links: `splits`/`child1`/`parent` describe the tree topology,
// aabbLow/aabbHigh are per-node bounding boxes, `elements` are the
// (tree-reordered) data points.
// NOTE(review): several original lines are elided in this chunk (the loop
// header, backtrack-state declarations, and some branch bodies) — verify
// the control flow against the full file before editing.
53 void searchNeighbors(const cuda::kd_tree_builder_detail::SplitInfo* splits,
56 const float4* aabbLow,
57 const float4* aabbHigh, const float4* elements, const float4& q, GPUResultSet& result, const Distance& distance = Distance() )
64 cuda::kd_tree_builder_detail::SplitInfo split;
// current==-1 marks "walked above the root": traversal is complete.
66 if( current==-1 ) break;
67 split = splits[current];
// Signed offset of the query from the splitting plane along the split axis.
70 if( split.split_dim==0 ) diff1=q.x- split.split_val;
71 else if( split.split_dim==1 ) diff1=q.y- split.split_val;
72 else if( split.split_dim==2 ) diff1=q.z- split.split_val;
74 // children are next to each other: leftChild+1 == rightChild
75 int leftChild= child1[current];
76 int bestChild=leftChild;
77 int otherChild=leftChild;
87 /* If this is a leaf node, then do check and return. */
// Leaf: brute-force every point in [split.left, split.right).
89 for (int i=split.left; i<split.right; ++i) {
90 float dist=distance.dist(elements[i],q);
91 result.insert(i,dist);
95 current=parent[current];
97 else { // go to closer child node
102 else { // continue moving back up the tree or visit far node?
103 // minimum possible distance between query point and a point inside the AABB
105 float4 aabbMin=aabbLow[otherChild];
106 float4 aabbMax=aabbHigh[otherChild];
// Accumulate the per-axis distance from q to the far child's box; axes
// where q lies inside [min,max] contribute nothing.
108 if( q.x < aabbMin.x ) mindistsq+=distance.axisDist(q.x, aabbMin.x);
109 else if( q.x > aabbMax.x ) mindistsq+=distance.axisDist(q.x, aabbMax.x);
110 if( q.y < aabbMin.y ) mindistsq+=distance.axisDist(q.y, aabbMin.y);
111 else if( q.y > aabbMax.y ) mindistsq+=distance.axisDist(q.y, aabbMax.y);
112 if( q.z < aabbMin.z ) mindistsq+=distance.axisDist(q.z, aabbMin.z);
113 else if( q.z > aabbMax.z ) mindistsq+=distance.axisDist(q.z, aabbMax.z);
115 // the far node was NOT the last node (== not visited yet) AND there could be a closer point in it
116 if(( lastNode==bestChild) && (mindistsq <= result.worstDist() ) ) {
123 current=parent[current];
131 template< typename GPUResultSet, typename Distance >
// CUDA kernel entry point: one thread per query row. Builds a float4 query
// (3 components read from `query`, w padded with 0), points `result` at this
// thread's output slots, then delegates traversal to searchNeighbors.
// Launch layout (see callers): 1D grid, blockDim.x threads per block,
// ceil(querysize/threadsPerBlock) blocks; excess tail threads exit early.
// NOTE(review): the __global__ qualifier and the child1/parent parameter
// lines are elided in this chunk.
133 void nearestKernel(const cuda::kd_tree_builder_detail::SplitInfo* splits,
136 const float4* aabbMin,
137 const float4* aabbMax, const float4* elements, const float* query, int stride, int resultStride, int* resultIndex, float* resultDist, int querysize, GPUResultSet result, Distance dist = Distance())
139 typedef float DistanceType;
140 typedef float ElementType;
141 // typedef DistanceType float;
// Flat 1D global thread index; doubles as the query row handled here.
142 size_t tid = blockDim.x*blockIdx.x + threadIdx.x;
// Tail guard: the grid may overshoot the number of queries.
144 if( tid >= querysize ) return;
146 float4 q = make_float4(query[tid*stride],query[tid*stride+1],query[tid*stride+2],0);
148 result.setResultLocation( resultDist, resultIndex, tid, resultStride );
150 searchNeighbors(splits,child1,parent,aabbMin,aabbMax,elements, q, result, dist);
157 //! contains some pointers that use cuda data types and that cannot be easily
158 //! forward-declared.
159 //! basically it contains all GPU buffers
160 template<typename Distance>
161 struct KDTreeCuda3dIndex<Distance>::GpuHelper
// Owning raw pointers to the device-side tree buffers; filled in by
// uploadTreeToGpu() and deleted by the destructor.
163 thrust::device_vector< cuda::kd_tree_builder_detail::SplitInfo >* gpu_splits_;
164 thrust::device_vector< int >* gpu_parent_;
165 thrust::device_vector< int >* gpu_child1_;
166 thrust::device_vector< float4 >* gpu_aabb_min_;
167 thrust::device_vector< float4 >* gpu_aabb_max_;
168 thrust::device_vector<float4>* gpu_points_;
// Permutation mapping internal (reordered) point indices back to the
// caller's original dataset indices.
169 thrust::device_vector<int>* gpu_vind_;
170 GpuHelper() : gpu_splits_(0), gpu_parent_(0), gpu_child1_(0), gpu_aabb_min_(0), gpu_aabb_max_(0), gpu_points_(0), gpu_vind_(0){
// NOTE(review): the destructor body is mostly elided in this chunk — only
// two of the delete statements are visible; confirm in the full file that
// every buffer above is freed exactly once.
180 delete gpu_aabb_max_;
182 delete gpu_aabb_min_;
192 //! thrust transform functor
193 //! transforms indices in the internal data set back to the original indices
// Holds a raw device pointer into gpu_vind_ (the reorder permutation).
// NOTE(review): the struct header and member declaration are elided here.
198 map_indices(const int* v) : v_(v) {
// Non-negative indices are looked up in the permutation; negative values
// (the -1 "no neighbor" marker used by the result buffers) presumably pass
// through unchanged — the else-branch is elided, TODO confirm.
// Returns float although it maps int indices (matches the original code);
// callers transform back into int vectors.
202 float operator() (const int&i) const
204 if( i>= 0 ) return v_[i];
209 //! implementation of L2 distance for the CUDA kernels
// Squared Euclidean distance (no square root), matching FLANN's CPU-side
// L2 convention of comparing squared distances.
// NOTE(review): the struct name, function qualifiers and the axisDist body
// are elided in this chunk.
215 axisDist( float a, float b )
222 dist( float4 a, float4 b )
// dot(diff,diff) sums the squared per-component differences over all four
// components; queries are built with w=0 (see nearestKernel), so the w term
// is expected to contribute nothing — confirm the data padding as well.
225 return dot(diff,diff);
//! implementation of L1 (Manhattan) distance for the CUDA kernels:
//! the sum of absolute per-axis differences.
struct CudaL1
{
    //! per-axis contribution to the distance: |a-b|.
    //! Used by searchNeighbors to lower-bound the distance to an AABB.
    static float
    __host__ __device__
    axisDist( float a, float b )
    {
        return fabs(a-b);
    }

    //! full 4-component L1 distance between two points.
    //! Fix: the original omitted fabs() around the z and w terms
    //! ("+( a.z-b.z)+(a.w-b.w)"), so a negative difference on those axes
    //! REDUCED the reported distance, corrupting neighbor ordering.
    static float
    __host__ __device__
    dist( float4 a, float4 b )
    {
        return fabs(a.x-b.x)+fabs(a.y-b.y)+fabs(a.z-b.z)+fabs(a.w-b.w);
    }
};
249 //! used to adapt CPU and GPU distance types.
250 //! specializations define the ::type as their corresponding GPU distance type
251 //! \see GpuDistance< L2<float> >, GpuDistance< L2_Simple<float> >
252 template< class Distance >
// Primary template presumably declares no ::type, so an unsupported CPU
// distance fails at compile time — the template body and the typedef lines
// inside each specialization are elided in this chunk; TODO confirm.
258 struct GpuDistance< L2<float> >
264 struct GpuDistance< L2_Simple<float> >
269 struct GpuDistance< L1<float> >
275 template< typename Distance >
// k-nearest-neighbor search on the GPU. Queries/indices/dists may live in
// host RAM (copied to device vectors here) or already on the GPU
// (params.matrices_in_gpu_ram) — the two arms below are parallel, differing
// only in where the buffers come from. Results are written into `indices`
// and `dists`; internal point indices are mapped back to dataset indices
// via gpu_vind_ before returning.
// NOTE(review): the if-conditions choosing between the kernel variants
// (single-NN vs. heap-based vs. plain kNN result sets) and several kernel
// argument lines are elided in this chunk.
276 void KDTreeCuda3dIndex<Distance>::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const
278 assert(indices.rows >= queries.rows);
279 assert(dists.rows >= queries.rows);
280 assert(int(indices.cols) >= knn);
281 assert( dists.cols == indices.cols && dists.stride==indices.stride );
// Strides converted from bytes to element counts.
283 int istride=queries.stride/sizeof(ElementType);
284 int ostride=indices.stride/4;
286 bool matrices_on_gpu = params.matrices_in_gpu_ram;
288 int threadsPerBlock = 128;
// Ceil-divide so the tail of the query set is covered.
289 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
291 float epsError = 1+params.eps;
292 bool sorted = params.sorted;
293 bool use_heap = params.use_heap;
295 typename GpuDistance<Distance>::type distance;
296 // std::cout<<" search: "<<std::endl;
297 // std::cout<<" rows: "<<indices.rows<<" "<<dists.rows<<" "<<queries.rows<<std::endl;
298 // std::cout<<" cols: "<<indices.cols<<" "<<dists.cols<<" "<<queries.cols<<std::endl;
299 // std::cout<<" stride: "<<indices.stride<<" "<<dists.stride<<" "<<queries.stride<<std::endl;
300 // std::cout<<" stride2:"<<istride<<" "<<ostride<<std::endl;
301 // std::cout<<" knn:"<<knn<<" matrices_on_gpu:"<<matrices_on_gpu<<std::endl;
// --- Host-resident matrices: stage everything through device vectors. ---
303 if( !matrices_on_gpu ) {
304 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
305 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
306 thrust::device_vector<float> distsDev(queries.rows* ostride);
307 thrust::device_vector<int> indicesDev(queries.rows* ostride);
// Variant with SingleResultSet — presumably the knn==1 fast path (the
// guarding condition is elided).
312 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
313 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
314 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
315 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
316 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
317 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
318 thrust::raw_pointer_cast(&queriesDev[0]),
321 thrust::raw_pointer_cast(&indicesDev[0]),
322 thrust::raw_pointer_cast(&distsDev[0]),
323 queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
324 // KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
325 // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
326 // thrust::raw_pointer_cast(&queriesDev[0]),
328 // thrust::raw_pointer_cast(&indicesDev[0]),
329 // thrust::raw_pointer_cast(&distsDev[0]),
330 // queries.rows, epsError);
// Variant with KnnResultSet<float,true> — heap-based result set
// (presumably guarded by use_heap; condition elided).
335 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
336 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
337 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
338 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
339 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
340 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
341 thrust::raw_pointer_cast(&queriesDev[0]),
344 thrust::raw_pointer_cast(&indicesDev[0]),
345 thrust::raw_pointer_cast(&distsDev[0]),
346 queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
// Variant with KnnResultSet<float,false> — non-heap kNN result set.
350 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
351 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
352 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
353 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
354 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
355 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
356 thrust::raw_pointer_cast(&queriesDev[0]),
359 thrust::raw_pointer_cast(&indicesDev[0]),
360 thrust::raw_pointer_cast(&distsDev[0]),
361 queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
// Copy results back to host and translate internal -> original indices.
366 thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
367 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
368 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
// --- GPU-resident matrices: wrap the caller's pointers directly. ---
371 thrust::device_ptr<float> qd = thrust::device_pointer_cast(queries.ptr());
372 thrust::device_ptr<float> dd = thrust::device_pointer_cast(dists.ptr());
373 thrust::device_ptr<int> id = thrust::device_pointer_cast(indices.ptr());
378 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
379 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
380 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
381 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
382 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
383 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
389 queries.rows, flann::cuda::SingleResultSet<float>(epsError),distance);
390 // KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_nodes_)[0])),
391 // thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
392 // thrust::raw_pointer_cast(&queriesDev[0]),
394 // thrust::raw_pointer_cast(&indicesDev[0]),
395 // thrust::raw_pointer_cast(&distsDev[0]),
396 // queries.rows, epsError);
401 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
402 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
403 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
404 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
405 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
406 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
412 queries.rows, flann::cuda::KnnResultSet<float, true>(knn,sorted,epsError)
416 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
417 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
418 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
419 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
420 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
421 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
427 queries.rows, flann::cuda::KnnResultSet<float, false>(knn,sorted,epsError),
// In-place translation of internal -> original indices on the GPU.
432 thrust::transform(id, id+knn*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
437 template< typename Distance>
// Radius search with per-query variable-length output (vector-of-vectors).
// Two-pass scheme: (1) a counting kernel determines how many neighbors each
// query has; (2) an exclusive scan over the counts yields per-query output
// offsets into flat result buffers, which a second kernel then fills.
// Returns (in the full file) the total number of neighbors found.
// NOTE(review): several lines are elided here, including the kernel-variant
// selection conditions (use_heap / max_neighbors) and loop closings.
438 int KDTreeCuda3dIndex<Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
439 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const
441 // assert(indices.roasdfws >= queries.rows);
442 // assert(dists.rows >= queries.rows);
444 int max_neighbors = params.max_neighbors;
445 bool sorted = params.sorted;
446 bool use_heap = params.use_heap;
447 if (indices.size() < queries.rows ) indices.resize(queries.rows);
448 if (dists.size() < queries.rows ) dists.resize(queries.rows);
450 int istride=queries.stride/sizeof(ElementType);
452 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
453 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
// Per-query neighbor counts, filled by the counting pass below.
454 thrust::device_vector<int> countsDev(queries.rows);
456 typename GpuDistance<Distance>::type distance;
458 int threadsPerBlock = 128;
459 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
// Pass 1: count neighbors within `radius` (capped at max_neighbors).
462 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
463 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
464 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
465 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
466 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
467 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
468 thrust::raw_pointer_cast(&queriesDev[0]),
471 thrust::raw_pointer_cast(&countsDev[0]),
473 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,max_neighbors),
477 thrust::host_vector<int> counts_host=countsDev;
479 if( max_neighbors!=0 ) { // we'll need this later, but the exclusive_scan will change the array
// Pre-size each per-query output vector from the counting pass.
480 for( size_t i=0; i<queries.rows; i++ ) {
481 int count = counts_host[i];
483 indices[i].resize(count);
484 dists[i].resize(count);
// back() must be read BEFORE the scan overwrites it: the total equals
// (exclusive-scan last element) + (original last count).
494 int neighbors_last_elem = countsDev.back();
495 thrust::exclusive_scan( countsDev.begin(), countsDev.end(), countsDev.begin() );
497 size_t total_neighbors=neighbors_last_elem+countsDev.back();
// max_neighbors==0 means "count only": no second pass needed.
498 if( max_neighbors==0 ) return total_neighbors;
// Flat result buffers; -1 / +inf are the "unused slot" markers.
500 thrust::device_vector<int> indicesDev(total_neighbors,-1);
501 thrust::device_vector<float> distsDev(total_neighbors,std::numeric_limits<float>::infinity());
// Pass 2: unlimited radius search (max_neighbors<0) ...
503 if( max_neighbors<0 ) {
504 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
505 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
506 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
507 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
508 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
509 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
510 thrust::raw_pointer_cast(&queriesDev[0]),
513 thrust::raw_pointer_cast(&indicesDev[0]),
514 thrust::raw_pointer_cast(&distsDev[0]),
515 queries.rows, flann::cuda::RadiusResultSet<float>(radius,thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
// ... or bounded search, heap variant (presumably guarded by use_heap;
// condition elided) ...
519 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
520 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
521 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
522 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
523 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
524 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
525 thrust::raw_pointer_cast(&queriesDev[0]),
528 thrust::raw_pointer_cast(&indicesDev[0]),
529 thrust::raw_pointer_cast(&distsDev[0]),
530 queries.rows, flann::cuda::RadiusKnnResultSet<float, true>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
// ... or bounded search, non-heap variant.
533 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
534 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
535 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
536 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
537 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
538 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
539 thrust::raw_pointer_cast(&queriesDev[0]),
542 thrust::raw_pointer_cast(&indicesDev[0]),
543 thrust::raw_pointer_cast(&distsDev[0]),
544 queries.rows, flann::cuda::RadiusKnnResultSet<float, false>(radius,max_neighbors, thrust::raw_pointer_cast(&countsDev[0]),sorted), distance);
// Translate internal -> original indices, then scatter the flat buffers
// back into the per-query vectors using the counts from pass 1.
547 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
548 thrust::host_vector<int> indices_temp = indicesDev;
549 thrust::host_vector<float> dists_temp = distsDev;
552 for( size_t i=0; i<queries.rows; i++ ) {
553 for( size_t j=0; j<counts_host[i]; j++ ) {
554 dists[i][j]=dists_temp[buffer_index];
555 indices[i][j]=indices_temp[buffer_index];
563 //! used in the radius search to count the total number of neighbors
// Predicate functor passed to thrust::count_if over the index buffers;
// the body is elided in this chunk — presumably `return i!=-1;` (matching
// the -1 "empty slot" fill value and the isNotMinusOne name used at the
// count_if call sites). TODO confirm against the full file.
567 bool operator() ( int i ){
572 template< typename Distance>
// Radius search with fixed-size matrix output (at most max_neighbors per
// query, -1 padding in unused slots). max_neighbors==0 degenerates to a
// pure counting pass whose per-query counts are written into `indices` and
// whose sum is returned. Mirrors knnSearchGpu's host/GPU-resident split.
// NOTE(review): kernel-variant selection conditions and several kernel
// argument lines are elided in this chunk.
573 int KDTreeCuda3dIndex< Distance >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const
575 int max_neighbors = params.max_neighbors;
576 assert(indices.rows >= queries.rows);
577 assert(dists.rows >= queries.rows || max_neighbors==0 );
578 assert(indices.stride==dists.stride || max_neighbors==0 );
579 assert( indices.cols==indices.stride/sizeof(int) );
580 assert(dists.rows >= queries.rows || max_neighbors==0 );
582 bool sorted = params.sorted;
583 bool matrices_on_gpu = params.matrices_in_gpu_ram;
584 float epsError = 1+params.eps;
585 bool use_heap = params.use_heap;
586 int istride=queries.stride/sizeof(ElementType);
587 int ostride= indices.stride/4;
// "unlimited" requests are capped by the output matrix width.
590 if( max_neighbors<0 ) max_neighbors=indices.cols;
// --- Host-resident matrices: stage through device vectors. ---
592 if( !matrices_on_gpu ) {
593 thrust::device_vector<float> queriesDev(istride* queries.rows,0);
594 thrust::copy( queries.ptr(), queries.ptr()+istride*queries.rows, queriesDev.begin() );
595 typename GpuDistance<Distance>::type distance;
596 int threadsPerBlock = 128;
597 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
// Counting-only mode: one kernel pass, per-query counts as the result.
598 if( max_neighbors== 0 ) {
599 thrust::device_vector<int> indicesDev(queries.rows* ostride);
600 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
601 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
602 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
603 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
604 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
605 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
606 thrust::raw_pointer_cast(&queriesDev[0]),
609 thrust::raw_pointer_cast(&indicesDev[0]),
611 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
614 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
// Sum of per-query counts == total neighbors within radius.
615 return thrust::reduce(indicesDev.begin(), indicesDev.end() );
619 thrust::device_vector<float> distsDev(queries.rows* max_neighbors);
620 thrust::device_vector<int> indicesDev(queries.rows* max_neighbors);
// Bounded radius search, heap variant (presumably guarded by use_heap;
// condition elided).
623 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
624 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
625 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
626 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
627 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
628 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
629 thrust::raw_pointer_cast(&queriesDev[0]),
632 thrust::raw_pointer_cast(&indicesDev[0]),
633 thrust::raw_pointer_cast(&distsDev[0]),
634 queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
// Non-heap variant.
637 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
638 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
639 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
640 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
641 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
642 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
643 thrust::raw_pointer_cast(&queriesDev[0]),
646 thrust::raw_pointer_cast(&indicesDev[0]),
647 thrust::raw_pointer_cast(&distsDev[0]),
648 queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
651 thrust::copy( distsDev.begin(), distsDev.end(), dists.ptr() );
652 thrust::transform(indicesDev.begin(), indicesDev.end(), indicesDev.begin(), map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
653 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
// Filled slots are exactly those not left at the -1 sentinel.
655 return thrust::count_if(indicesDev.begin(), indicesDev.end(), isNotMinusOne() );
// --- GPU-resident matrices: wrap the caller's pointers directly. ---
659 thrust::device_ptr<float> qd=thrust::device_pointer_cast(queries.ptr());
660 thrust::device_ptr<float> dd=thrust::device_pointer_cast(dists.ptr());
661 thrust::device_ptr<int> id=thrust::device_pointer_cast(indices.ptr());
662 typename GpuDistance<Distance>::type distance;
663 int threadsPerBlock = 128;
664 int blocksPerGrid=(queries.rows+threadsPerBlock-1)/threadsPerBlock;
666 if( max_neighbors== 0 ) {
667 thrust::device_vector<int> indicesDev(queries.rows* indices.stride);
668 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
669 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
670 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
671 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
672 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
673 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
679 queries.rows, flann::cuda::CountingRadiusResultSet<float>(radius,-1),
682 thrust::copy( indicesDev.begin(), indicesDev.end(), indices.ptr() );
683 return thrust::reduce(indicesDev.begin(), indicesDev.end() );
687 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
688 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
689 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
690 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
691 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
692 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
698 queries.rows, flann::cuda::KnnRadiusResultSet<float, true>(max_neighbors,sorted,epsError, radius), distance);
701 KdTreeCudaPrivate::nearestKernel<<<blocksPerGrid, threadsPerBlock>>> (thrust::raw_pointer_cast(&((*gpu_helper_->gpu_splits_)[0])),
702 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_child1_)[0])),
703 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_parent_)[0])),
704 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_min_)[0])),
705 thrust::raw_pointer_cast(&((*gpu_helper_->gpu_aabb_max_)[0])),
706 thrust::raw_pointer_cast( &((*gpu_helper_->gpu_points_)[0]) ),
712 queries.rows, flann::cuda::KnnRadiusResultSet<float, false>(max_neighbors,sorted,epsError, radius), distance);
715 thrust::transform(id, id+max_neighbors*queries.rows, id, map_indices(thrust::raw_pointer_cast( &((*gpu_helper_->gpu_vind_))[0]) ));
717 return thrust::count_if(id, id+max_neighbors*queries.rows, isNotMinusOne() );
721 template<typename Distance>
// Builds the kd-tree on the GPU and populates gpu_helper_. The dataset is
// padded to float4 (either already float4 on the GPU, or padded on the host
// into data_), uploaded, handed to CudaKdTreeBuilder, and finally gathered
// into tree order via the builder's index permutation.
// NOTE(review): the else-branch opener, the pad-value assignment and some
// closing braces are elided in this chunk.
722 void KDTreeCuda3dIndex<Distance>::uploadTreeToGpu()
724 // just make sure that no weird alignment stuff is going on...
725 // shouldn't, but who knows
726 // (I would make this a (boost) static assertion, but so far flann seems to avoid boost
727 // assert( sizeof( KdTreeCudaPrivate::GpuNode)==sizeof( Node ) );
729 gpu_helper_ = new GpuHelper;
730 gpu_helper_->gpu_points_=new thrust::device_vector<float4>(size_);
// Staging buffer in dataset order; reordered into gpu_points_ at the end.
731 thrust::device_vector<float4> tmp(size_);
732 if( get_param(index_params_,"input_is_gpu_float4",false) ) {
// Caller guarantees the data is already device-resident float4s.
733 assert( dataset_.cols == 3 && dataset_.stride==4*sizeof(float));
734 thrust::copy( thrust::device_pointer_cast((float4*)dataset_.ptr()),thrust::device_pointer_cast((float4*)(dataset_.ptr()))+size_,tmp.begin());
738 // k is limited to 4 -> use 128bit-alignment regardless of dimensionality
739 // makes cpu search about 5% slower, but gpu can read a float4 w/ a single instruction
740 // (vs a float2 and a float load for a float3 value)
741 // pad data directly to avoid having to copy and re-format the data when
742 // copying it to the GPU
743 data_ = flann::Matrix<ElementType>(new ElementType[size_*4], size_, dim_,4*4);
744 for (size_t i=0; i<size_; ++i) {
745 for (size_t j=0; j<dim_; ++j) {
746 data_[i][j] = dataset_[i][j];
// Pad the remaining components up to 4; the assigned value is elided here
// (presumably 0 so padding doesn't affect distances) — TODO confirm.
748 for (size_t j=dim_; j<4; ++j) {
752 thrust::copy((float4*)data_.ptr(),(float4*)(data_.ptr())+size_,tmp.begin());
// Build the tree structure on the GPU from the staged points.
755 CudaKdTreeBuilder builder( tmp, leaf_max_size_ );
758 gpu_helper_->gpu_splits_ = builder.splits_;
759 gpu_helper_->gpu_aabb_min_ = builder.aabb_min_;
760 gpu_helper_->gpu_aabb_max_ = builder.aabb_max_;
761 gpu_helper_->gpu_child1_ = builder.child1_;
762 gpu_helper_->gpu_parent_=builder.parent_;
763 gpu_helper_->gpu_vind_=builder.index_x_;
// Reorder the points into tree order so leaf scans read contiguously.
764 thrust::gather( builder.index_x_->begin(), builder.index_x_->end(), tmp.begin(), gpu_helper_->gpu_points_->begin());
766 // gpu_helper_->gpu_nodes_=new thrust::device_vector<KdTreeCudaPrivate::GpuNode>(node_count_);
769 // gpu_helper_->gpu_vind_=new thrust::device_vector<int>(size_);
770 // thrust::copy( (KdTreeCudaPrivate::GpuNode*)&(tree_[0]), ((KdTreeCudaPrivate::GpuNode*)&(tree_[0]))+tree_.size(), gpu_helper_->gpu_nodes_->begin());
772 // thrust::copy(vind_.begin(),vind_.end(),gpu_helper_->gpu_vind_->begin());
778 template<typename Distance>
// Releases gpu_helper_ (whose destructor deletes the device buffers).
// Body is elided in this chunk — TODO confirm it also nulls gpu_helper_.
779 void KDTreeCuda3dIndex<Distance>::clearGpuBuffers()
785 // explicit instantiations for distance-independent functions
// Explicit template instantiations so the CUDA-compiled definitions above
// are emitted for the three supported distances (L2, L2_Simple, L1) and can
// be linked against from host-compiled translation units.
// NOTE(review): the `template` keywords preceding each declaration are
// elided in this chunk.
787 void KDTreeCuda3dIndex<flann::L2<float> >::uploadTreeToGpu();
790 void KDTreeCuda3dIndex<flann::L2<float> >::clearGpuBuffers();
793 struct KDTreeCuda3dIndex<flann::L2<float> >::GpuHelper;
796 void KDTreeCuda3dIndex<flann::L2<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
799 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
801 int KDTreeCuda3dIndex< flann::L2<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
802 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
804 // explicit instantiations for distance-independent functions
806 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::uploadTreeToGpu();
809 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::clearGpuBuffers();
812 struct KDTreeCuda3dIndex<flann::L2_Simple<float> >::GpuHelper;
815 void KDTreeCuda3dIndex<flann::L2_Simple<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
818 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
820 int KDTreeCuda3dIndex< flann::L2_Simple<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
821 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;
824 // explicit instantiations for distance-independent functions
826 void KDTreeCuda3dIndex<flann::L1<float> >::uploadTreeToGpu();
829 void KDTreeCuda3dIndex<flann::L1<float> >::clearGpuBuffers();
832 struct KDTreeCuda3dIndex<flann::L1<float> >::GpuHelper;
835 void KDTreeCuda3dIndex<flann::L1<float> >::knnSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, size_t knn, const SearchParams& params) const;
838 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, float radius, const SearchParams& params) const;
840 int KDTreeCuda3dIndex< flann::L1<float> >::radiusSearchGpu(const Matrix<ElementType>& queries, std::vector< std::vector<int> >& indices,
841 std::vector<std::vector<DistanceType> >& dists, float radius, const SearchParams& params) const;