ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
RoiPoolKernel.cu
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include <math.h>
9 #include <stdio.h>
10 
11 #include "cloudViewer/core/CUDAUtils.h"
12 #include "ml/contrib/RoiPoolKernel.h"
13 
14 namespace cloudViewer {
15 namespace ml {
16 namespace contrib {
17 
18 #define THREADS_PER_BLOCK 256
19 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
20 
21 __device__ inline int pt_in_box3d(float x,
22  float y,
23  float z,
24  float cx,
25  float bottom_y,
26  float cz,
27  float h,
28  float w,
29  float l,
30  float angle,
31  float max_dis) {
32  float x_rot, z_rot, cosa, sina, cy;
33  int in_flag;
34  cy = bottom_y - h / 2.0;
35  if ((fabsf(x - cx) > max_dis) || (fabsf(y - cy) > h / 2.0) ||
36  (fabsf(z - cz) > max_dis)) {
37  return 0;
38  }
39  cosa = cos(angle);
40  sina = sin(angle);
41  x_rot = (x - cx) * cosa + (z - cz) * (-sina);
42  z_rot = (x - cx) * sina + (z - cz) * cosa;
43 
44  in_flag = (x_rot >= -l / 2.0) & (x_rot <= l / 2.0) & (z_rot >= -w / 2.0) &
45  (z_rot <= w / 2.0);
46  return in_flag;
47 }
48 
49 __global__ void roipool3d_forward(int batch_size,
50  int pts_num,
51  int boxes_num,
52  int feature_in_len,
53  int sampled_pts_num,
54  const float *xyz,
55  const float *boxes3d,
56  const float *pts_feature,
57  float *pooled_features,
58  int *pooled_empty_flag) {
59  // params xyz: (B, N, 3)
60  // params boxes3d: (B, M, 7)
61  // params pts_feature: (B, N, C)
62  // params pooled_features: (B, M, 512, 3+C)
63  // params pooled_empty_flag: (B, M)
64 
65  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
66  if (boxes_idx >= boxes_num) {
67  return;
68  }
69 
70  for (int i = 0; i < batch_size; i++) {
71  int cnt = 0;
72  for (int k = 0; k < pts_num; k++) {
73  int pt_offset = i * pts_num * 3 + k * 3;
74  int box_offset = i * boxes_num * 7 + boxes_idx * 7;
75 
76  int cur_in_flag = pt_in_box3d(
77  xyz[pt_offset], xyz[pt_offset + 1], xyz[pt_offset + 2],
78  boxes3d[box_offset], boxes3d[box_offset + 1],
79  boxes3d[box_offset + 2], boxes3d[box_offset + 3],
80  boxes3d[box_offset + 4], boxes3d[box_offset + 5],
81  boxes3d[box_offset + 6], 10.0);
82  if (cur_in_flag) {
83  if (cnt < sampled_pts_num) {
84  int feature_out_offset =
85  i * boxes_num * sampled_pts_num *
86  (3 + feature_in_len) +
87  boxes_idx * sampled_pts_num * (3 + feature_in_len) +
88  cnt * (3 + feature_in_len);
89 
90  int feature_in_offset =
91  i * pts_num * feature_in_len + k * feature_in_len;
92 
93  // copy xyz
94  for (int j = 0; j < 3; j++)
95  pooled_features[feature_out_offset + j] =
96  xyz[pt_offset + j];
97 
98  // copy feature
99  for (int j = 0; j < feature_in_len; j++)
100  pooled_features[feature_out_offset + 3 + j] =
101  pts_feature[feature_in_offset + j];
102 
103  cnt++;
104  } else
105  break;
106  }
107  }
108 
109  if (cnt == 0) {
110  pooled_empty_flag[i * boxes_num + boxes_idx] = 1;
111  } else if (cnt < sampled_pts_num) {
112  // duplicate same points for sampling
113  for (int k = cnt; k < sampled_pts_num; k++) {
114  int duplicate_idx = k % cnt;
115  int src_offset =
116  i * boxes_num * sampled_pts_num * (3 + feature_in_len) +
117  boxes_idx * sampled_pts_num * (3 + feature_in_len) +
118  duplicate_idx * (3 + feature_in_len);
119  int dst_offset =
120  i * boxes_num * sampled_pts_num * (3 + feature_in_len) +
121  boxes_idx * sampled_pts_num * (3 + feature_in_len) +
122  k * (3 + feature_in_len);
123  for (int j = 0; j < 3 + feature_in_len; j++)
124  pooled_features[dst_offset + j] =
125  pooled_features[src_offset + j];
126  }
127  }
128  }
129 }
130 
131 __global__ void assign_pts_to_box3d(int batch_size,
132  int pts_num,
133  int boxes_num,
134  const float *xyz,
135  const float *boxes3d,
136  int *pts_assign) {
137  // params xyz: (B, N, 3)
138  // params boxes3d: (B, M, 7)
139  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
140  // background points
141  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
142  int box_idx = blockIdx.y;
143  int bs_idx = blockIdx.z;
144 
145  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
146  return;
147  }
148  int assign_idx =
149  bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
150  pts_assign[assign_idx] = 0;
151 
152  int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
153  int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
154 
155  int cur_in_flag =
156  pt_in_box3d(xyz[pt_offset], xyz[pt_offset + 1], xyz[pt_offset + 2],
157  boxes3d[box_offset], boxes3d[box_offset + 1],
158  boxes3d[box_offset + 2], boxes3d[box_offset + 3],
159  boxes3d[box_offset + 4], boxes3d[box_offset + 5],
160  boxes3d[box_offset + 6], 10.0);
161 
162  pts_assign[assign_idx] = cur_in_flag;
163  // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx *
164  // pts_num + pt_idx]);
165 }
166 
167 __global__ void get_pooled_idx(int batch_size,
168  int pts_num,
169  int boxes_num,
170  int sampled_pts_num,
171  const int *pts_assign,
172  int *pts_idx,
173  int *pooled_empty_flag) {
174  // params xyz: (B, N, 3)
175  // params pts_feature: (B, N, C)
176  // params pts_assign: (B, N)
177  // params pts_idx: (B, M, 512)
178  // params pooled_empty_flag: (B, M)
179 
180  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
181  if (boxes_idx >= boxes_num) {
182  return;
183  }
184 
185  int bs_idx = blockIdx.y;
186 
187  int cnt = 0;
188  for (int k = 0; k < pts_num; k++) {
189  if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
190  boxes_idx]) {
191  if (cnt < sampled_pts_num) {
192  pts_idx[bs_idx * boxes_num * sampled_pts_num +
193  boxes_idx * sampled_pts_num + cnt] = k;
194  cnt++;
195  } else
196  break;
197  }
198  }
199 
200  if (cnt == 0) {
201  pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
202  } else if (cnt < sampled_pts_num) {
203  // duplicate same points for sampling
204  for (int k = cnt; k < sampled_pts_num; k++) {
205  int duplicate_idx = k % cnt;
206  int base_offset = bs_idx * boxes_num * sampled_pts_num +
207  boxes_idx * sampled_pts_num;
208  pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
209  }
210  }
211 }
212 
213 __global__ void roipool3d_forward(int batch_size,
214  int pts_num,
215  int boxes_num,
216  int feature_in_len,
217  int sampled_pts_num,
218  const float *xyz,
219  const int *pts_idx,
220  const float *pts_feature,
221  float *pooled_features,
222  int *pooled_empty_flag) {
223  // params xyz: (B, N, 3)
224  // params pts_idx: (B, M, 512)
225  // params pts_feature: (B, N, C)
226  // params pooled_features: (B, M, 512, 3+C)
227  // params pooled_empty_flag: (B, M)
228 
229  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
230  int box_idx = blockIdx.y;
231  int bs_idx = blockIdx.z;
232 
233  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
234  bs_idx >= batch_size) {
235  return;
236  }
237 
238  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
239  return;
240  }
241 
242  int temp_idx = bs_idx * boxes_num * sampled_pts_num +
243  box_idx * sampled_pts_num + sample_pt_idx;
244  int src_pt_idx = pts_idx[temp_idx];
245  int dst_feature_offset = temp_idx * (3 + feature_in_len);
246 
247  for (int j = 0; j < 3; j++)
248  pooled_features[dst_feature_offset + j] =
249  xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
250 
251  int src_feature_offset =
252  bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
253  for (int j = 0; j < feature_in_len; j++)
254  pooled_features[dst_feature_offset + 3 + j] =
255  pts_feature[src_feature_offset + j];
256 }
257 
258 void roipool3dLauncher(int batch_size,
259  int pts_num,
260  int boxes_num,
261  int feature_in_len,
262  int sampled_pts_num,
263  const float *xyz,
264  const float *boxes3d,
265  const float *pts_feature,
266  float *pooled_features,
267  int *pooled_empty_flag) {
268  // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num,
269  // boxes_num);
270  int *pts_assign = NULL;
271  cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num *
272  sizeof(int)); // (batch_size, N, M)
273  // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num *
274  // sizeof(int));
275 
276  dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num,
277  batch_size); // blockIdx.x(col), blockIdx.y(row)
278  dim3 threads(THREADS_PER_BLOCK);
279  assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num,
280  xyz, boxes3d, pts_assign);
281 
282  int *pts_idx = NULL;
283  cudaMalloc(&pts_idx,
284  batch_size * boxes_num * sampled_pts_num *
285  sizeof(int)); // (batch_size, M, sampled_pts_num)
286 
287  dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK),
288  batch_size); // blockIdx.x(col), blockIdx.y(row)
289  get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num,
290  sampled_pts_num, pts_assign, pts_idx,
291  pooled_empty_flag);
292 
293  dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
294  batch_size);
295  roipool3d_forward<<<blocks_pool, threads>>>(
296  batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
297  xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
298 
299  cudaFree(pts_assign);
300  cudaFree(pts_idx);
301 
302 #ifdef DEBUG
303  core::cuda::Synchronize(); // for using printf in kernel function
304 #endif
305 }
306 
307 } // namespace contrib
308 } // namespace ml
309 } // namespace cloudViewer