ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
Nms.cpp
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #include "ml/contrib/Nms.h"
9 
10 #include <Helper.h>
11 #include <tbb/parallel_for.h>
12 
13 #include <iostream>
14 #include <numeric>
15 
16 #include "ml/contrib/IoUImpl.h"
17 
18 namespace cloudViewer {
19 namespace ml {
20 namespace contrib {
21 
22 using namespace cloudViewer;
23 
24 template <typename T>
25 static std::vector<int64_t> SortIndexes(const T *values,
26  int64_t num,
27  bool descending = false) {
28  std::vector<int64_t> indices(num);
29  std::iota(indices.begin(), indices.end(), 0);
30  if (descending) {
31  std::stable_sort(indices.begin(), indices.end(),
32  [&values](int64_t i, int64_t j) {
33  return values[i] > values[j];
34  });
35  } else {
36  std::stable_sort(indices.begin(), indices.end(),
37  [&values](int64_t i, int64_t j) {
38  return values[i] < values[j];
39  });
40  }
41  return indices;
42 }
43 
44 static void AllPairsSortedIoU(const float *boxes,
45  const float *scores,
46  const int64_t *sort_indices,
47  uint64_t *mask,
48  int n,
49  double nms_overlap_thresh) {
50  const int num_block_cols = utility::DivUp(n, NMS_BLOCK_SIZE);
51  const int num_block_rows = utility::DivUp(n, NMS_BLOCK_SIZE);
52 
53  // We need the concept of "block" since the mask is a uint64_t binary bit
54  // map, and the block size is exactly 64x64. This is also consistent with
55  // the CUDA implementation.
56  tbb::parallel_for(
57  tbb::blocked_range<int>(0, num_block_cols),
58  [&](const tbb::blocked_range<int> &r) {
59  for (int block_col_idx = r.begin(); block_col_idx != r.end();
60  ++block_col_idx) {
61  for (int block_row_idx = 0; block_row_idx < num_block_rows;
62  ++block_row_idx) {
63  // Local block row size.
64  const int row_size =
65  fminf(n - block_row_idx * NMS_BLOCK_SIZE,
66  NMS_BLOCK_SIZE);
67  // Local block col size.
68  const int col_size =
69  fminf(n - block_col_idx * NMS_BLOCK_SIZE,
70  NMS_BLOCK_SIZE);
71 
72  // Comparing src and dst. With all blocks, all src and
73  // dst indices are compared. In one block, the following
74  // src and dst indices are compared:
75  // - src: BS * block_row_idx : BS * block_row_idx +
76  // row_size
77  // - dst: BS * block_col_idx : BS * block_col_idx +
78  // col_size
79  //
80  // Result:
81  // mask[i, j] is a 64-bit integer where mask[i, j][k] (k
82  // counted from right) is 1 iff box[i] overlaps with
83  // box[BS*j+k].
84  for (int src_idx = NMS_BLOCK_SIZE * block_row_idx;
85  src_idx <
86  NMS_BLOCK_SIZE * block_row_idx + row_size;
87  src_idx++) {
88  uint64_t t = 0;
89  for (int dst_idx = NMS_BLOCK_SIZE * block_col_idx;
90  dst_idx <
91  NMS_BLOCK_SIZE * block_col_idx + col_size;
92  dst_idx++) {
93  // Unlike the CUDA impl, both src_idx and
94  // dst_idx here are indexes to the global
95  // memory. Thus we need to compute the local
96  // index for dst_idx.
97  if (IoUBev2DWithMinAndMax(
98  boxes + sort_indices[src_idx] * 5,
99  boxes + sort_indices[dst_idx] * 5) >
100  nms_overlap_thresh) {
101  t |= 1ULL
102  << (dst_idx -
103  NMS_BLOCK_SIZE * block_col_idx);
104  }
105  }
106  mask[src_idx * num_block_cols + block_col_idx] = t;
107  }
108  }
109  }
110  });
111 }
112 
113 std::vector<int64_t> NmsCPUKernel(const float *boxes,
114  const float *scores,
115  int n,
116  double nms_overlap_thresh) {
117  std::vector<int64_t> sort_indices = SortIndexes(scores, n, true);
118 
119  const int num_block_cols = utility::DivUp(n, NMS_BLOCK_SIZE);
120 
121  // Call kernel. Results will be saved in masks.
122  // boxes: (n, 5)
123  // mask: (n, n/BS)
124  std::vector<uint64_t> mask_vec(n * num_block_cols);
125  uint64_t *mask = mask_vec.data();
126  AllPairsSortedIoU(boxes, scores, sort_indices.data(), mask, n,
127  nms_overlap_thresh);
128 
129  // Write to keep. remv_cpu has n bits in total. If the bit is 1, the
130  // corresponding box will be removed.
131  std::vector<uint64_t> remv_cpu(num_block_cols, 0);
132  std::vector<int64_t> keep_indices;
133  for (int i = 0; i < n; i++) {
134  int block_col_idx = i / NMS_BLOCK_SIZE;
135  int inner_block_col_idx = i % NMS_BLOCK_SIZE; // threadIdx.x
136 
137  // Querying the i-th bit in remv_cpu, counted from the right.
138  // - remv_cpu[block_col_idx]: the block bitmap containing the query.
139  // - 1ULL << inner_block_col_idx: the one-hot bitmap to extract i.
140  if (!(remv_cpu[block_col_idx] & (1ULL << inner_block_col_idx))) {
141  // Keep the i-th box.
142  keep_indices.push_back(sort_indices[i]);
143 
144  // Any box that overlaps with the i-th box will be removed.
145  uint64_t *p = mask + i * num_block_cols;
146  for (int j = block_col_idx; j < num_block_cols; j++) {
147  remv_cpu[j] |= p[j];
148  }
149  }
150  }
151 
152  return keep_indices;
153 }
154 
155 } // namespace contrib
156 } // namespace ml
157 } // namespace cloudViewer
Helper functions for the ml ops.
static void AllPairsSortedIoU(const float *boxes, const float *scores, const int64_t *sort_indices, uint64_t *mask, int n, double nms_overlap_thresh)
Definition: Nms.cpp:44
static std::vector< int64_t > SortIndexes(const T *values, int64_t num, bool descending=false)
Definition: Nms.cpp:25
constexpr int NMS_BLOCK_SIZE
Definition: IoUImpl.h:19
std::vector< int64_t > NmsCPUKernel(const float *boxes, const float *scores, int n, double nms_overlap_thresh)
Definition: Nms.cpp:113
int DivUp(int x, int y)
Computes the quotient of x/y with rounding up.
Definition: Helper.h:210
Generic file read and write utility for python interface.