11 #include <tbb/parallel_for.h>
27 bool descending =
false) {
28 std::vector<int64_t> indices(num);
29 std::iota(indices.begin(), indices.end(), 0);
31 std::stable_sort(indices.begin(), indices.end(),
32 [&values](int64_t i, int64_t j) {
33 return values[i] > values[j];
36 std::stable_sort(indices.begin(), indices.end(),
37 [&values](int64_t i, int64_t j) {
38 return values[i] < values[j];
46 const int64_t *sort_indices,
49 double nms_overlap_thresh) {
57 tbb::blocked_range<int>(0, num_block_cols),
58 [&](
const tbb::blocked_range<int> &r) {
59 for (
int block_col_idx = r.begin(); block_col_idx != r.end();
61 for (int block_row_idx = 0; block_row_idx < num_block_rows;
65 fminf(n - block_row_idx * NMS_BLOCK_SIZE,
69 fminf(n - block_col_idx * NMS_BLOCK_SIZE,
84 for (int src_idx = NMS_BLOCK_SIZE * block_row_idx;
86 NMS_BLOCK_SIZE * block_row_idx + row_size;
89 for (int dst_idx = NMS_BLOCK_SIZE * block_col_idx;
91 NMS_BLOCK_SIZE * block_col_idx + col_size;
97 if (IoUBev2DWithMinAndMax(
98 boxes + sort_indices[src_idx] * 5,
99 boxes + sort_indices[dst_idx] * 5) >
100 nms_overlap_thresh) {
103 NMS_BLOCK_SIZE * block_col_idx);
106 mask[src_idx * num_block_cols + block_col_idx] = t;
116 double nms_overlap_thresh) {
117 std::vector<int64_t> sort_indices =
SortIndexes(scores, n,
true);
124 std::vector<uint64_t> mask_vec(n * num_block_cols);
125 uint64_t *mask = mask_vec.data();
131 std::vector<uint64_t> remv_cpu(num_block_cols, 0);
132 std::vector<int64_t> keep_indices;
133 for (
int i = 0; i < n; i++) {
140 if (!(remv_cpu[block_col_idx] & (1ULL << inner_block_col_idx))) {
142 keep_indices.push_back(sort_indices[i]);
145 uint64_t *p = mask + i * num_block_cols;
146 for (
int j = block_col_idx; j < num_block_cols; j++) {
Helper functions for the ml ops.
static void AllPairsSortedIoU(const float *boxes, const float *scores, const int64_t *sort_indices, uint64_t *mask, int n, double nms_overlap_thresh)
static std::vector< int64_t > SortIndexes(const T *values, int64_t num, bool descending=false)
constexpr int NMS_BLOCK_SIZE
std::vector< int64_t > NmsCPUKernel(const float *boxes, const float *scores, int n, double nms_overlap_thresh)
int DivUp(int x, int y)
Computes the quotient of x/y with rounding up.
Generic file read and write utility for python interface.