1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
11 #include "cloudViewer/core/CUDAUtils.h"
12 #include "ml/contrib/RoiPoolKernel.h"
14 namespace cloudViewer {
18 #define THREADS_PER_BLOCK 256
19 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
21 __device__ inline int pt_in_box3d(float x,
32 float x_rot, z_rot, cosa, sina, cy;
34 cy = bottom_y - h / 2.0;
35 if ((fabsf(x - cx) > max_dis) || (fabsf(y - cy) > h / 2.0) ||
36 (fabsf(z - cz) > max_dis)) {
41 x_rot = (x - cx) * cosa + (z - cz) * (-sina);
42 z_rot = (x - cx) * sina + (z - cz) * cosa;
44 in_flag = (x_rot >= -l / 2.0) & (x_rot <= l / 2.0) & (z_rot >= -w / 2.0) &
49 __global__ void roipool3d_forward(int batch_size,
56 const float *pts_feature,
57 float *pooled_features,
58 int *pooled_empty_flag) {
59 // params xyz: (B, N, 3)
60 // params boxes3d: (B, M, 7)
61 // params pts_feature: (B, N, C)
62 // params pooled_features: (B, M, 512, 3+C)
63 // params pooled_empty_flag: (B, M)
65 int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
66 if (boxes_idx >= boxes_num) {
70 for (int i = 0; i < batch_size; i++) {
72 for (int k = 0; k < pts_num; k++) {
73 int pt_offset = i * pts_num * 3 + k * 3;
74 int box_offset = i * boxes_num * 7 + boxes_idx * 7;
76 int cur_in_flag = pt_in_box3d(
77 xyz[pt_offset], xyz[pt_offset + 1], xyz[pt_offset + 2],
78 boxes3d[box_offset], boxes3d[box_offset + 1],
79 boxes3d[box_offset + 2], boxes3d[box_offset + 3],
80 boxes3d[box_offset + 4], boxes3d[box_offset + 5],
81 boxes3d[box_offset + 6], 10.0);
83 if (cnt < sampled_pts_num) {
84 int feature_out_offset =
85 i * boxes_num * sampled_pts_num *
86 (3 + feature_in_len) +
87 boxes_idx * sampled_pts_num * (3 + feature_in_len) +
88 cnt * (3 + feature_in_len);
90 int feature_in_offset =
91 i * pts_num * feature_in_len + k * feature_in_len;
94 for (int j = 0; j < 3; j++)
95 pooled_features[feature_out_offset + j] =
99 for (int j = 0; j < feature_in_len; j++)
100 pooled_features[feature_out_offset + 3 + j] =
101 pts_feature[feature_in_offset + j];
110 pooled_empty_flag[i * boxes_num + boxes_idx] = 1;
111 } else if (cnt < sampled_pts_num) {
112 // duplicate same points for sampling
113 for (int k = cnt; k < sampled_pts_num; k++) {
114 int duplicate_idx = k % cnt;
116 i * boxes_num * sampled_pts_num * (3 + feature_in_len) +
117 boxes_idx * sampled_pts_num * (3 + feature_in_len) +
118 duplicate_idx * (3 + feature_in_len);
120 i * boxes_num * sampled_pts_num * (3 + feature_in_len) +
121 boxes_idx * sampled_pts_num * (3 + feature_in_len) +
122 k * (3 + feature_in_len);
123 for (int j = 0; j < 3 + feature_in_len; j++)
124 pooled_features[dst_offset + j] =
125 pooled_features[src_offset + j];
131 __global__ void assign_pts_to_box3d(int batch_size,
135 const float *boxes3d,
137 // params xyz: (B, N, 3)
138 // params boxes3d: (B, M, 7)
139 // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
141 int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
142 int box_idx = blockIdx.y;
143 int bs_idx = blockIdx.z;
145 if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
149 bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
150 pts_assign[assign_idx] = 0;
152 int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
153 int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
156 pt_in_box3d(xyz[pt_offset], xyz[pt_offset + 1], xyz[pt_offset + 2],
157 boxes3d[box_offset], boxes3d[box_offset + 1],
158 boxes3d[box_offset + 2], boxes3d[box_offset + 3],
159 boxes3d[box_offset + 4], boxes3d[box_offset + 5],
160 boxes3d[box_offset + 6], 10.0);
162 pts_assign[assign_idx] = cur_in_flag;
163 // printf("bs=%d, pt=%d, in=%d\n", bs_idx, pt_idx, pts_assign[bs_idx *
164 // pts_num + pt_idx]);
167 __global__ void get_pooled_idx(int batch_size,
171 const int *pts_assign,
173 int *pooled_empty_flag) {
174 // params xyz: (B, N, 3)
175 // params pts_feature: (B, N, C)
176 // params pts_assign: (B, N)
177 // params pts_idx: (B, M, 512)
178 // params pooled_empty_flag: (B, M)
180 int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
181 if (boxes_idx >= boxes_num) {
185 int bs_idx = blockIdx.y;
188 for (int k = 0; k < pts_num; k++) {
189 if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
191 if (cnt < sampled_pts_num) {
192 pts_idx[bs_idx * boxes_num * sampled_pts_num +
193 boxes_idx * sampled_pts_num + cnt] = k;
201 pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
202 } else if (cnt < sampled_pts_num) {
203 // duplicate same points for sampling
204 for (int k = cnt; k < sampled_pts_num; k++) {
205 int duplicate_idx = k % cnt;
206 int base_offset = bs_idx * boxes_num * sampled_pts_num +
207 boxes_idx * sampled_pts_num;
208 pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
213 __global__ void roipool3d_forward(int batch_size,
220 const float *pts_feature,
221 float *pooled_features,
222 int *pooled_empty_flag) {
223 // params xyz: (B, N, 3)
224 // params pts_idx: (B, M, 512)
225 // params pts_feature: (B, N, C)
226 // params pooled_features: (B, M, 512, 3+C)
227 // params pooled_empty_flag: (B, M)
229 int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
230 int box_idx = blockIdx.y;
231 int bs_idx = blockIdx.z;
233 if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
234 bs_idx >= batch_size) {
238 if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
242 int temp_idx = bs_idx * boxes_num * sampled_pts_num +
243 box_idx * sampled_pts_num + sample_pt_idx;
244 int src_pt_idx = pts_idx[temp_idx];
245 int dst_feature_offset = temp_idx * (3 + feature_in_len);
247 for (int j = 0; j < 3; j++)
248 pooled_features[dst_feature_offset + j] =
249 xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
251 int src_feature_offset =
252 bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
253 for (int j = 0; j < feature_in_len; j++)
254 pooled_features[dst_feature_offset + 3 + j] =
255 pts_feature[src_feature_offset + j];
258 void roipool3dLauncher(int batch_size,
264 const float *boxes3d,
265 const float *pts_feature,
266 float *pooled_features,
267 int *pooled_empty_flag) {
268 // printf("batch_size=%d, pts_num=%d, boxes_num=%d\n", batch_size, pts_num,
270 int *pts_assign = NULL;
271 cudaMalloc(&pts_assign, batch_size * pts_num * boxes_num *
272 sizeof(int)); // (batch_size, N, M)
273 // cudaMemset(&pts_assign, -1, batch_size * pts_num * boxes_num *
276 dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num,
277 batch_size); // blockIdx.x(col), blockIdx.y(row)
278 dim3 threads(THREADS_PER_BLOCK);
279 assign_pts_to_box3d<<<blocks, threads>>>(batch_size, pts_num, boxes_num,
280 xyz, boxes3d, pts_assign);
284 batch_size * boxes_num * sampled_pts_num *
285 sizeof(int)); // (batch_size, M, sampled_pts_num)
287 dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK),
288 batch_size); // blockIdx.x(col), blockIdx.y(row)
289 get_pooled_idx<<<blocks2, threads>>>(batch_size, pts_num, boxes_num,
290 sampled_pts_num, pts_assign, pts_idx,
293 dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num,
295 roipool3d_forward<<<blocks_pool, threads>>>(
296 batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
297 xyz, pts_idx, pts_feature, pooled_features, pooled_empty_flag);
299 cudaFree(pts_assign);
303 core::cuda::Synchronize(); // for using printf in kernel function
307 } // namespace contrib
309 } // namespace cloudViewer