1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
8 #define _USE_MATH_DEFINES
16 #include "mvs/patch_match_cuda.h"
17 #include "util/cuda.h"
18 #include "util/cudacc.h"
19 #include "util/logging.h"
21 // The number of threads per Cuda thread. Warning: Do not change this value,
22 // since the templated window sizes rely on this value.
23 #define THREADS_PER_BLOCK 32
25 // We must not include "util/math.h" to avoid any Eigen includes here,
26 // since Visual Studio cannot compile some of the Eigen/Boost expressions.
28 #define DEG2RAD(deg) deg * 0.0174532925199432
34 // Calibration of reference image as {fx, cx, fy, cy}.
35 __constant__ float ref_K[4];
36 // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}.
37 __constant__ float ref_inv_K[4];
39 __device__ inline void Mat33DotVec3(const float mat[9],
42 result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
43 result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
44 result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
47 __device__ inline void Mat33DotVec3Homogeneous(const float mat[9],
50 const float inv_z = 1.0f / (mat[6] * vec[0] + mat[7] * vec[1] + mat[8]);
51 result[0] = inv_z * (mat[0] * vec[0] + mat[1] * vec[1] + mat[2]);
52 result[1] = inv_z * (mat[3] * vec[0] + mat[4] * vec[1] + mat[5]);
55 __device__ inline float DotProduct3(const float vec1[3], const float vec2[3]) {
56 return vec1[0] * vec2[0] + vec1[1] * vec2[1] + vec1[2] * vec2[2];
59 __device__ inline float GenerateRandomDepth(const float depth_min,
60 const float depth_max,
61 curandState* rand_state) {
62 return curand_uniform(rand_state) * (depth_max - depth_min) + depth_min;
65 __device__ inline void GenerateRandomNormal(const int row,
67 curandState* rand_state,
69 // Unbiased sampling of normal, according to George Marsaglia, "Choosing a
70 // Point from the Surface of a Sphere", 1972.
75 v1 = 2.0f * curand_uniform(rand_state) - 1.0f;
76 v2 = 2.0f * curand_uniform(rand_state) - 1.0f;
77 s = v1 * v1 + v2 * v2;
80 const float s_norm = sqrt(1.0f - s);
81 normal[0] = 2.0f * v1 * s_norm;
82 normal[1] = 2.0f * v2 * s_norm;
83 normal[2] = 1.0f - 2.0f * s;
85 // Make sure normal is looking away from camera.
86 const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
87 ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
88 if (DotProduct3(normal, view_ray) > 0) {
89 normal[0] = -normal[0];
90 normal[1] = -normal[1];
91 normal[2] = -normal[2];
95 __device__ inline float PerturbDepth(const float perturbation,
97 curandState* rand_state) {
98 const float depth_min = (1.0f - perturbation) * depth;
99 const float depth_max = (1.0f + perturbation) * depth;
100 return GenerateRandomDepth(depth_min, depth_max, rand_state);
103 __device__ inline void PerturbNormal(const int row,
105 const float perturbation,
106 const float normal[3],
107 curandState* rand_state,
108 float perturbed_normal[3],
109 const int num_trials = 0) {
110 // Perturbation rotation angles.
111 const float a1 = (curand_uniform(rand_state) - 0.5f) * perturbation;
112 const float a2 = (curand_uniform(rand_state) - 0.5f) * perturbation;
113 const float a3 = (curand_uniform(rand_state) - 0.5f) * perturbation;
115 const float sin_a1 = sin(a1);
116 const float sin_a2 = sin(a2);
117 const float sin_a3 = sin(a3);
118 const float cos_a1 = cos(a1);
119 const float cos_a2 = cos(a2);
120 const float cos_a3 = cos(a3);
124 R[0] = cos_a2 * cos_a3;
125 R[1] = -cos_a2 * sin_a3;
127 R[3] = cos_a1 * sin_a3 + cos_a3 * sin_a1 * sin_a2;
128 R[4] = cos_a1 * cos_a3 - sin_a1 * sin_a2 * sin_a3;
129 R[5] = -cos_a2 * sin_a1;
130 R[6] = sin_a1 * sin_a3 - cos_a1 * cos_a3 * sin_a2;
131 R[7] = cos_a3 * sin_a1 + cos_a1 * sin_a2 * sin_a3;
132 R[8] = cos_a1 * cos_a2;
134 // Perturb the normal vector.
135 Mat33DotVec3(R, normal, perturbed_normal);
137 // Make sure the perturbed normal is still looking in the same direction as
138 // the viewing direction, otherwise try again but with smaller perturbation.
139 const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
140 ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
141 if (DotProduct3(perturbed_normal, view_ray) >= 0.0f) {
142 const int kMaxNumTrials = 3;
143 if (num_trials < kMaxNumTrials) {
144 PerturbNormal(row, col, 0.5f * perturbation, normal, rand_state,
145 perturbed_normal, num_trials + 1);
148 perturbed_normal[0] = normal[0];
149 perturbed_normal[1] = normal[1];
150 perturbed_normal[2] = normal[2];
155 // Make sure normal has unit norm.
156 const float inv_norm =
157 rsqrt(DotProduct3(perturbed_normal, perturbed_normal));
158 perturbed_normal[0] *= inv_norm;
159 perturbed_normal[1] *= inv_norm;
160 perturbed_normal[2] *= inv_norm;
163 __device__ inline void ComputePointAtDepth(const float row,
167 point[0] = depth * (ref_inv_K[0] * col + ref_inv_K[1]);
168 point[1] = depth * (ref_inv_K[2] * row + ref_inv_K[3]);
172 // Transfer depth on plane from viewing ray at row1 to row2. The returned
173 // depth is the intersection of the viewing ray through row2 with the plane
174 // at row1 defined by the given depth and normal.
175 __device__ inline float PropagateDepth(const float depth1,
176 const float normal1[3],
179 // Point along first viewing ray.
180 const float x1 = depth1 * (ref_inv_K[2] * row1 + ref_inv_K[3]);
181 const float y1 = depth1;
182 // Point on plane defined by point along first viewing ray and plane
184 const float x2 = x1 + normal1[2];
185 const float y2 = y1 - normal1[1];
187 // Origin of second viewing ray.
188 // const float x3 = 0.0f;
189 // const float y3 = 0.0f;
190 // Point on second viewing ray.
191 const float x4 = ref_inv_K[2] * row2 + ref_inv_K[3];
192 // const float y4 = 1.0f;
194 // Intersection of the lines ((x1, y1), (x2, y2)) and ((x3, y3), (x4, y4)).
195 const float denom = x2 - x1 + x4 * (y1 - y2);
196 constexpr float kEps = 1e-5f;
197 if (abs(denom) < kEps) {
200 const float nom = y1 * x2 - x1 * y2;
204 // First, compute triangulation angle between reference and source image for 3D
205 // point. Second, compute incident angle between viewing direction of source
206 // image and normal direction of 3D point. Both angles are cosine distances.
207 __device__ inline void ComputeViewingAngles(
208 const cudaTextureObject_t poses_texture,
209 const float point[3],
210 const float normal[3],
212 float* cos_triangulation_angle,
213 float* cos_incident_angle) {
214 *cos_triangulation_angle = 0.0f;
215 *cos_incident_angle = 0.0f;
217 // Projection center of source image.
219 for (int i = 0; i < 3; ++i) {
220 C[i] = tex2D<float>(poses_texture, i + 16, image_idx);
223 // Ray from point to camera.
224 const float SX[3] = {C[0] - point[0], C[1] - point[1], C[2] - point[2]};
226 // Length of ray from reference image to point.
227 const float RX_inv_norm = rsqrt(DotProduct3(point, point));
229 // Length of ray from source image to point.
230 const float SX_inv_norm = rsqrt(DotProduct3(SX, SX));
232 *cos_incident_angle = DotProduct3(SX, normal) * SX_inv_norm;
233 *cos_triangulation_angle =
234 DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
237 __device__ inline void ComposeHomography(
238 const cudaTextureObject_t poses_texture,
243 const float normal[3],
245 // Calibration of source image.
247 for (int i = 0; i < 4; ++i) {
248 K[i] = tex2D<float>(poses_texture, i, image_idx);
251 // Relative rotation between reference and source image.
253 for (int i = 0; i < 9; ++i) {
254 R[i] = tex2D<float>(poses_texture, i + 4, image_idx);
257 // Relative translation between reference and source image.
259 for (int i = 0; i < 3; ++i) {
260 T[i] = tex2D<float>(poses_texture, i + 13, image_idx);
263 // Distance to the plane.
266 (normal[0] * (ref_inv_K[0] * col + ref_inv_K[1]) +
267 normal[1] * (ref_inv_K[2] * row + ref_inv_K[3]) + normal[2]);
268 const float inv_dist = 1.0f / dist;
270 const float inv_dist_N0 = inv_dist * normal[0];
271 const float inv_dist_N1 = inv_dist * normal[1];
272 const float inv_dist_N2 = inv_dist * normal[2];
274 // Homography as H = K * (R - T * n' / d) * Kref^-1.
275 H[0] = ref_inv_K[0] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
276 K[1] * (R[6] + inv_dist_N0 * T[2]));
277 H[1] = ref_inv_K[2] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
278 K[1] * (R[7] + inv_dist_N1 * T[2]));
279 H[2] = K[0] * (R[2] + inv_dist_N2 * T[0]) +
280 K[1] * (R[8] + inv_dist_N2 * T[2]) +
281 ref_inv_K[1] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
282 K[1] * (R[6] + inv_dist_N0 * T[2])) +
283 ref_inv_K[3] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
284 K[1] * (R[7] + inv_dist_N1 * T[2]));
285 H[3] = ref_inv_K[0] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
286 K[3] * (R[6] + inv_dist_N0 * T[2]));
287 H[4] = ref_inv_K[2] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
288 K[3] * (R[7] + inv_dist_N1 * T[2]));
289 H[5] = K[2] * (R[5] + inv_dist_N2 * T[1]) +
290 K[3] * (R[8] + inv_dist_N2 * T[2]) +
291 ref_inv_K[1] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
292 K[3] * (R[6] + inv_dist_N0 * T[2])) +
293 ref_inv_K[3] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
294 K[3] * (R[7] + inv_dist_N1 * T[2]));
295 H[6] = ref_inv_K[0] * (R[6] + inv_dist_N0 * T[2]);
296 H[7] = ref_inv_K[2] * (R[7] + inv_dist_N1 * T[2]);
297 H[8] = R[8] + ref_inv_K[1] * (R[6] + inv_dist_N0 * T[2]) +
298 ref_inv_K[3] * (R[7] + inv_dist_N1 * T[2]) + inv_dist_N2 * T[2];
301 // Each thread in the current warp / thread block reads in 3 columns of the
302 // reference image. The shared memory holds 3 * THREADS_PER_BLOCK columns and
303 // kWindowSize rows of the reference image. Each thread copies every
304 // THREADS_PER_BLOCK-th column from global to shared memory offset by its ID.
305 // For example, if THREADS_PER_BLOCK = 32, then thread 0 reads columns 0, 32, 64
306 // and thread 1 columns 1, 33, 65. When computing the photoconsistency, which is
307 // shared among each thread block, each thread can then read the reference image
308 // colors from shared memory. Note that this limits the window radius to a
309 // maximum of THREADS_PER_BLOCK.
310 template <int kWindowSize>
311 struct LocalRefImage {
312 const static int kWindowRadius = kWindowSize / 2;
313 const static int kThreadBlockRadius = 1;
314 const static int kThreadBlockSize = 2 * kThreadBlockRadius + 1;
315 const static int kNumRows = kWindowSize;
316 const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK;
317 const static int kDataSize = kNumRows * kNumColumns;
319 __device__ explicit LocalRefImage(
320 const cudaTextureObject_t ref_image_texture)
321 : ref_image_texture_(ref_image_texture) {}
323 float* data = nullptr;
325 __device__ inline void Read(const int row) {
326 // For the first row, read the entire block into shared memory. For all
327 // consecutive rows, it is only necessary to shift the rows in shared
328 // memory up by one element and then read in a new row at the bottom of
329 // the shared memory. Note that this assumes that the calling loop
330 // starts with the first row and then consecutively reads in the next
333 const int thread_id = threadIdx.x;
334 const int thread_block_first_id = blockDim.x * blockIdx.x;
336 const int local_col_start = thread_id;
337 const int global_col_start = thread_block_first_id -
338 kThreadBlockRadius * THREADS_PER_BLOCK +
342 int global_row = row - kWindowRadius;
343 for (int local_row = 0; local_row < kNumRows;
344 ++local_row, ++global_row) {
345 int local_col = local_col_start;
346 int global_col = global_col_start;
348 for (int block = 0; block < kThreadBlockSize; ++block) {
349 data[local_row * kNumColumns + local_col] = tex2D<float>(
350 ref_image_texture_, global_col, global_row);
351 local_col += THREADS_PER_BLOCK;
352 global_col += THREADS_PER_BLOCK;
356 // Move rows in shared memory up by one row.
357 for (int local_row = 1; local_row < kNumRows; ++local_row) {
358 int local_col = local_col_start;
360 for (int block = 0; block < kThreadBlockSize; ++block) {
361 data[(local_row - 1) * kNumColumns + local_col] =
362 data[local_row * kNumColumns + local_col];
363 local_col += THREADS_PER_BLOCK;
367 // Read next row into the last row of shared memory.
368 const int local_row = kNumRows - 1;
369 const int global_row = row + kWindowRadius;
370 int local_col = local_col_start;
371 int global_col = global_col_start;
373 for (int block = 0; block < kThreadBlockSize; ++block) {
374 data[local_row * kNumColumns + local_col] = tex2D<float>(
375 ref_image_texture_, global_col, global_row);
376 local_col += THREADS_PER_BLOCK;
377 global_col += THREADS_PER_BLOCK;
383 const cudaTextureObject_t ref_image_texture_;
386 // The return values is 1 - NCC, so the range is [0, 2], the smaller the
387 // value, the better the color consistency.
388 template <int kWindowSize, int kWindowStep>
389 struct PhotoConsistencyCostComputer {
390 const static int kWindowRadius = kWindowSize / 2;
392 __device__ PhotoConsistencyCostComputer(
393 const cudaTextureObject_t ref_image_texture,
394 const cudaTextureObject_t src_images_texture,
395 const cudaTextureObject_t poses_texture,
396 const float sigma_spatial,
397 const float sigma_color)
398 : local_ref_image(ref_image_texture),
399 src_images_texture_(src_images_texture),
400 poses_texture_(poses_texture),
401 bilateral_weight_computer_(sigma_spatial, sigma_color) {}
403 // Maximum photo consistency cost as 1 - min(NCC).
404 const float kMaxCost = 2.0f;
406 // Thread warp local reference image data around current patch.
407 typedef LocalRefImage<kWindowSize> LocalRefImageType;
408 LocalRefImageType local_ref_image;
410 // Precomputed sum of raw and squared image intensities.
411 float local_ref_sum = 0.0f;
412 float local_ref_squared_sum = 0.0f;
414 // Index of source image.
415 int src_image_idx = -1;
417 // Center position of patch in reference image.
421 // Depth and normal for which to warp patch.
423 const float* normal = nullptr;
425 __device__ inline void Read(const int row) {
426 local_ref_image.Read(row);
430 __device__ inline float Compute() const {
432 ComposeHomography(poses_texture_, src_image_idx, row, col, depth,
436 for (int i = 0; i < 8; ++i) {
437 tform_step[i] = kWindowStep * tform[i];
440 const int thread_id = threadIdx.x;
441 const int row_start = row - kWindowRadius;
442 const int col_start = col - kWindowRadius;
444 float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
445 float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
446 float z = tform[6] * col_start + tform[7] * row_start + tform[8];
447 float base_col_src = col_src;
448 float base_row_src = row_src;
451 int ref_image_idx = THREADS_PER_BLOCK - kWindowRadius + thread_id;
452 int ref_image_base_idx = ref_image_idx;
454 const float ref_center_color =
455 local_ref_image.data[ref_image_idx +
456 kWindowRadius * 3 * THREADS_PER_BLOCK +
458 const float ref_color_sum = local_ref_sum;
459 const float ref_color_squared_sum = local_ref_squared_sum;
460 float src_color_sum = 0.0f;
461 float src_color_squared_sum = 0.0f;
462 float src_ref_color_sum = 0.0f;
463 float bilateral_weight_sum = 0.0f;
465 for (int row = -kWindowRadius; row <= kWindowRadius;
466 row += kWindowStep) {
467 for (int col = -kWindowRadius; col <= kWindowRadius;
468 col += kWindowStep) {
469 const float inv_z = 1.0f / z;
470 const float norm_col_src = inv_z * col_src + 0.5f;
471 const float norm_row_src = inv_z * row_src + 0.5f;
472 const float ref_color = local_ref_image.data[ref_image_idx];
473 const float src_color =
474 tex2DLayered<float>(src_images_texture_, norm_col_src,
475 norm_row_src, src_image_idx);
477 const float bilateral_weight =
478 bilateral_weight_computer_.Compute(
479 row, col, ref_center_color, ref_color);
481 const float bilateral_weight_src = bilateral_weight * src_color;
483 src_color_sum += bilateral_weight_src;
484 src_color_squared_sum += bilateral_weight_src * src_color;
485 src_ref_color_sum += bilateral_weight_src * ref_color;
486 bilateral_weight_sum += bilateral_weight;
488 ref_image_idx += kWindowStep;
490 // Accumulate warped source coordinates per row to reduce
491 // numerical errors. Note that this is necessary since
492 // coordinates usually are in the order of 1000s as opposed to
493 // the color values which are normalized to the range [0, 1].
494 col_src += tform_step[0];
495 row_src += tform_step[3];
499 ref_image_base_idx += kWindowStep * 3 * THREADS_PER_BLOCK;
500 ref_image_idx = ref_image_base_idx;
502 base_col_src += tform_step[1];
503 base_row_src += tform_step[4];
504 base_z += tform_step[7];
506 col_src = base_col_src;
507 row_src = base_row_src;
511 const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
512 src_color_sum *= inv_bilateral_weight_sum;
513 src_color_squared_sum *= inv_bilateral_weight_sum;
514 src_ref_color_sum *= inv_bilateral_weight_sum;
516 const float ref_color_var =
517 ref_color_squared_sum - ref_color_sum * ref_color_sum;
518 const float src_color_var =
519 src_color_squared_sum - src_color_sum * src_color_sum;
521 // Based on Jensen's Inequality for convex functions, the variance
522 // should always be larger than 0. Do not make this threshold smaller.
523 constexpr float kMinVar = 1e-5f;
524 if (ref_color_var < kMinVar || src_color_var < kMinVar) {
527 const float src_ref_color_covar =
528 src_ref_color_sum - ref_color_sum * src_color_sum;
529 const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
530 return max(0.0f, min(kMaxCost, 1.0f - src_ref_color_covar /
536 const cudaTextureObject_t src_images_texture_;
537 const cudaTextureObject_t poses_texture_;
538 const BilateralWeightComputer bilateral_weight_computer_;
541 __device__ inline float ComputeGeomConsistencyCost(
542 const cudaTextureObject_t poses_texture,
543 const cudaTextureObject_t src_depth_maps_texture,
548 const float max_cost) {
549 // Extract projection matrices for source image.
551 for (int i = 0; i < 12; ++i) {
552 P[i] = tex2D<float>(poses_texture, i + 19, image_idx);
555 for (int i = 0; i < 12; ++i) {
556 inv_P[i] = tex2D<float>(poses_texture, i + 31, image_idx);
559 // Project point in reference image to world.
560 float forward_point[3];
561 ComputePointAtDepth(row, col, depth, forward_point);
563 // Project world point to source image.
564 const float inv_forward_z =
565 1.0f / (P[8] * forward_point[0] + P[9] * forward_point[1] +
566 P[10] * forward_point[2] + P[11]);
568 inv_forward_z * (P[0] * forward_point[0] + P[1] * forward_point[1] +
569 P[2] * forward_point[2] + P[3]);
571 inv_forward_z * (P[4] * forward_point[0] + P[5] * forward_point[1] +
572 P[6] * forward_point[2] + P[7]);
574 // Extract depth in source image.
575 const float src_depth = tex2DLayered<float>(
576 src_depth_maps_texture, src_col + 0.5f, src_row + 0.5f, image_idx);
578 // Projection outside of source image.
579 if (src_depth == 0.0f) {
583 // Project point in source image to world.
584 src_col *= src_depth;
585 src_row *= src_depth;
586 const float backward_point_x = inv_P[0] * src_col + inv_P[1] * src_row +
587 inv_P[2] * src_depth + inv_P[3];
588 const float backward_point_y = inv_P[4] * src_col + inv_P[5] * src_row +
589 inv_P[6] * src_depth + inv_P[7];
590 const float backward_point_z = inv_P[8] * src_col + inv_P[9] * src_row +
591 inv_P[10] * src_depth + inv_P[11];
592 const float inv_backward_point_z = 1.0f / backward_point_z;
594 // Project world point back to reference image.
595 const float backward_col =
596 inv_backward_point_z *
597 (ref_K[0] * backward_point_x + ref_K[1] * backward_point_z);
598 const float backward_row =
599 inv_backward_point_z *
600 (ref_K[2] * backward_point_y + ref_K[3] * backward_point_z);
602 // Return truncated reprojection error between original observation and
603 // the forward-backward projected observation.
604 const float diff_col = col - backward_col;
605 const float diff_row = row - backward_row;
606 return min(max_cost, sqrt(diff_col * diff_col + diff_row * diff_row));
609 // Find index of minimum in given values.
610 template <int kNumCosts>
611 __device__ inline int FindMinCost(const float costs[kNumCosts]) {
612 float min_cost = costs[0];
613 int min_cost_idx = 0;
614 for (int idx = 1; idx < kNumCosts; ++idx) {
615 if (costs[idx] <= min_cost) {
616 min_cost = costs[idx];
623 __device__ inline void TransformPDFToCDF(float* probs, const int num_probs) {
624 float prob_sum = 0.0f;
625 for (int i = 0; i < num_probs; ++i) {
626 prob_sum += probs[i];
628 const float inv_prob_sum = 1.0f / prob_sum;
630 float cum_prob = 0.0f;
631 for (int i = 0; i < num_probs; ++i) {
632 const float prob = probs[i] * inv_prob_sum;
638 class LikelihoodComputer {
640 __device__ LikelihoodComputer(const float ncc_sigma,
641 const float min_triangulation_angle,
642 const float incident_angle_sigma)
643 : cos_min_triangulation_angle_(cos(min_triangulation_angle)),
644 inv_incident_angle_sigma_square_(
645 -0.5f / (incident_angle_sigma * incident_angle_sigma)),
646 inv_ncc_sigma_square_(-0.5f / (ncc_sigma * ncc_sigma)),
647 ncc_norm_factor_(ComputeNCCCostNormFactor(ncc_sigma)) {}
649 // Compute forward message from current cost and forward message of
650 // previous / neighboring pixel.
651 __device__ float ComputeForwardMessage(const float cost,
652 const float prev) const {
653 return ComputeMessage<true>(cost, prev);
656 // Compute backward message from current cost and backward message of
657 // previous / neighboring pixel.
658 __device__ float ComputeBackwardMessage(const float cost,
659 const float prev) const {
660 return ComputeMessage<false>(cost, prev);
663 // Compute the selection probability from the forward and backward message.
664 __device__ inline float ComputeSelProb(const float alpha,
667 const float prev_weight) const {
668 const float zn0 = (1.0f - alpha) * (1.0f - beta);
669 const float zn1 = alpha * beta;
670 const float curr = zn1 / (zn0 + zn1);
671 return prev_weight * prev + (1.0f - prev_weight) * curr;
674 // Compute NCC probability. Note that cost = 1 - NCC.
675 __device__ inline float ComputeNCCProb(const float cost) const {
676 return exp(cost * cost * inv_ncc_sigma_square_) * ncc_norm_factor_;
679 // Compute the triangulation angle probability.
680 __device__ inline float ComputeTriProb(
681 const float cos_triangulation_angle) const {
682 const float abs_cos_triangulation_angle = abs(cos_triangulation_angle);
683 if (abs_cos_triangulation_angle > cos_min_triangulation_angle_) {
685 1.0f - (1.0f - abs_cos_triangulation_angle) /
686 (1.0f - cos_min_triangulation_angle_);
687 const float likelihood = 1.0f - scaled * scaled;
688 return min(1.0f, max(0.0f, likelihood));
694 // Compute the incident angle probability.
695 __device__ inline float ComputeIncProb(
696 const float cos_incident_angle) const {
697 const float x = 1.0f - max(0.0f, cos_incident_angle);
698 return exp(x * x * inv_incident_angle_sigma_square_);
701 // Compute the warping/resolution prior probability.
702 template <int kWindowSize>
703 __device__ inline float ComputeResolutionProb(const float H[9],
705 const float col) const {
706 const int kWindowRadius = kWindowSize / 2;
708 // Warp corners of patch in reference image to source image.
710 const float ref1[2] = {col - kWindowRadius, row - kWindowRadius};
711 Mat33DotVec3Homogeneous(H, ref1, src1);
713 const float ref2[2] = {col - kWindowRadius, row + kWindowRadius};
714 Mat33DotVec3Homogeneous(H, ref2, src2);
716 const float ref3[2] = {col + kWindowRadius, row + kWindowRadius};
717 Mat33DotVec3Homogeneous(H, ref3, src3);
719 const float ref4[2] = {col + kWindowRadius, row - kWindowRadius};
720 Mat33DotVec3Homogeneous(H, ref4, src4);
722 // Compute area of patches in reference and source image.
723 const float ref_area = kWindowSize * kWindowSize;
724 const float src_area =
726 (src1[0] * src2[1] - src2[0] * src1[1] - src1[0] * src4[1] +
727 src2[0] * src3[1] - src3[0] * src2[1] + src4[0] * src1[1] +
728 src3[0] * src4[1] - src4[0] * src3[1]));
730 if (ref_area > src_area) {
731 return src_area / ref_area;
733 return ref_area / src_area;
738 // The normalization for the likelihood function, i.e. the normalization for
739 // the prior on the matching cost.
740 __device__ static inline float ComputeNCCCostNormFactor(
741 const float ncc_sigma) {
742 // A = sqrt(2pi)*sigma/2*erf(sqrt(2)/sigma)
743 // erf(x) = 2/sqrt(pi) * integral from 0 to x of exp(-t^2) dt
744 return 2.0f / (sqrt(2.0f * M_PI) * ncc_sigma *
745 erff(2.0f / (ncc_sigma * 1.414213562f)));
748 // Compute the forward or backward message.
749 template <bool kForward>
750 __device__ inline float ComputeMessage(const float cost,
751 const float prev) const {
752 constexpr float kUniformProb = 0.5f;
753 constexpr float kNoChangeProb = 0.99999f;
754 const float kChangeProb = 1.0f - kNoChangeProb;
755 const float emission = ComputeNCCProb(cost);
757 float zn0; // Message for selection probability = 0.
758 float zn1; // Message for selection probability = 1.
760 zn0 = (prev * kChangeProb + (1.0f - prev) * kNoChangeProb) *
762 zn1 = (prev * kNoChangeProb + (1.0f - prev) * kChangeProb) *
765 zn0 = prev * emission * kChangeProb +
766 (1.0f - prev) * kUniformProb * kNoChangeProb;
767 zn1 = prev * emission * kNoChangeProb +
768 (1.0f - prev) * kUniformProb * kChangeProb;
771 return zn1 / (zn0 + zn1);
774 const float cos_min_triangulation_angle_;
775 const float inv_incident_angle_sigma_square_;
776 const float inv_ncc_sigma_square_;
777 const float ncc_norm_factor_;
780 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
781 __global__ void InitNormalMap(GpuMat<float> normal_map,
782 GpuMat<curandState> rand_state_map) {
783 const int row = blockDim.y * blockIdx.y + threadIdx.y;
784 const int col = blockDim.x * blockIdx.x + threadIdx.x;
785 if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
786 curandState rand_state = rand_state_map.Get(row, col);
788 GenerateRandomNormal(row, col, &rand_state, normal);
789 normal_map.SetSlice(row, col, normal);
790 rand_state_map.Set(row, col, rand_state);
794 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
795 __global__ void RotateNormalMap(GpuMat<float> normal_map) {
796 const int row = blockDim.y * blockIdx.y + threadIdx.y;
797 const int col = blockDim.x * blockIdx.x + threadIdx.x;
798 if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
800 normal_map.GetSlice(row, col, normal);
801 float rotated_normal[3];
802 rotated_normal[0] = normal[1];
803 rotated_normal[1] = -normal[0];
804 rotated_normal[2] = normal[2];
805 normal_map.SetSlice(row, col, rotated_normal);
809 template <int kWindowSize, int kWindowStep>
810 __global__ void ComputeInitialCost(GpuMat<float> cost_map,
811 const GpuMat<float> depth_map,
812 const GpuMat<float> normal_map,
813 const cudaTextureObject_t ref_image_texture,
814 const GpuMat<float> ref_sum_image,
815 const GpuMat<float> ref_squared_sum_image,
816 const cudaTextureObject_t src_images_texture,
817 const cudaTextureObject_t poses_texture,
818 const float sigma_spatial,
819 const float sigma_color) {
820 const int col = blockDim.x * blockIdx.x + threadIdx.x;
822 typedef PhotoConsistencyCostComputer<kWindowSize, kWindowStep>
823 PhotoConsistencyCostComputerType;
824 PhotoConsistencyCostComputerType pcc_computer(
825 ref_image_texture, src_images_texture, poses_texture, sigma_spatial,
827 pcc_computer.col = col;
829 __shared__ float local_ref_image_data
830 [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
831 pcc_computer.local_ref_image.data = &local_ref_image_data[0];
833 float normal[3] = {0};
834 pcc_computer.normal = normal;
836 for (int row = 0; row < cost_map.GetHeight(); ++row) {
837 // Note that this must be executed even for pixels outside the borders,
838 // since pixels are used in the local neighborhood of the current pixel.
839 pcc_computer.Read(row);
841 if (col < cost_map.GetWidth()) {
842 pcc_computer.depth = depth_map.Get(row, col);
843 normal_map.GetSlice(row, col, normal);
845 pcc_computer.row = row;
846 pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
847 pcc_computer.local_ref_squared_sum =
848 ref_squared_sum_image.Get(row, col);
850 for (int image_idx = 0; image_idx < cost_map.GetDepth();
852 pcc_computer.src_image_idx = image_idx;
853 cost_map.Set(row, col, image_idx, pcc_computer.Compute());
859 struct SweepOptions {
860 float perturbation = 1.0f;
861 float depth_min = 0.0f;
862 float depth_max = 1.0f;
863 int num_samples = 15;
864 float sigma_spatial = 3.0f;
865 float sigma_color = 0.3f;
866 float ncc_sigma = 0.6f;
867 float min_triangulation_angle = 0.5f;
868 float incident_angle_sigma = 0.9f;
869 float prev_sel_prob_weight = 0.0f;
870 float geom_consistency_regularizer = 0.1f;
871 float geom_consistency_max_cost = 5.0f;
872 float filter_min_ncc = 0.1f;
873 float filter_min_triangulation_angle = 3.0f;
874 int filter_min_num_consistent = 2;
875 float filter_geom_consistency_max_cost = 1.0f;
878 template <int kWindowSize,
880 bool kGeomConsistencyTerm = false,
881 bool kFilterPhotoConsistency = false,
882 bool kFilterGeomConsistency = false>
883 __global__ void SweepFromTopToBottom(
884 GpuMat<float> global_workspace,
885 GpuMat<curandState> rand_state_map,
886 GpuMat<float> cost_map,
887 GpuMat<float> depth_map,
888 GpuMat<float> normal_map,
889 GpuMat<uint8_t> consistency_mask,
890 GpuMat<float> sel_prob_map,
891 const GpuMat<float> prev_sel_prob_map,
892 const cudaTextureObject_t ref_image_texture,
893 const GpuMat<float> ref_sum_image,
894 const GpuMat<float> ref_squared_sum_image,
895 const cudaTextureObject_t src_images_texture,
896 const cudaTextureObject_t src_depth_maps_texture,
897 const cudaTextureObject_t poses_texture,
898 const SweepOptions options) {
899 const int col = blockDim.x * blockIdx.x + threadIdx.x;
901 // Probability for boundary pixels.
902 constexpr float kUniformProb = 0.5f;
904 LikelihoodComputer likelihood_computer(options.ncc_sigma,
905 options.min_triangulation_angle,
906 options.incident_angle_sigma);
908 float* forward_message =
909 &global_workspace.GetPtr()[col * global_workspace.GetHeight()];
910 float* sampling_probs =
911 &global_workspace.GetPtr()[global_workspace.GetWidth() *
912 global_workspace.GetHeight() +
913 col * global_workspace.GetHeight()];
915 //////////////////////////////////////////////////////////////////////////////
916 // Compute backward message for all rows. Note that the backward messages
917 // are temporarily stored in the sel_prob_map and replaced row by row as the
918 // updated forward messages are computed further below.
919 //////////////////////////////////////////////////////////////////////////////
921 if (col < cost_map.GetWidth()) {
922 for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
923 // Compute backward message.
924 float beta = kUniformProb;
925 for (int row = cost_map.GetHeight() - 1; row >= 0; --row) {
926 const float cost = cost_map.Get(row, col, image_idx);
927 beta = likelihood_computer.ComputeBackwardMessage(cost, beta);
928 sel_prob_map.Set(row, col, image_idx, beta);
931 // Initialize forward message.
932 forward_message[image_idx] = kUniformProb;
936 //////////////////////////////////////////////////////////////////////////////
937 // Estimate parameters for remaining rows and compute selection
939 //////////////////////////////////////////////////////////////////////////////
941 typedef PhotoConsistencyCostComputer<kWindowSize, kWindowStep>
942 PhotoConsistencyCostComputerType;
943 PhotoConsistencyCostComputerType pcc_computer(
944 ref_image_texture, src_images_texture, poses_texture,
945 options.sigma_spatial, options.sigma_color);
946 pcc_computer.col = col;
948 __shared__ float local_ref_image_data
949 [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
950 pcc_computer.local_ref_image.data = &local_ref_image_data[0];
954 float normal[3] = {0};
957 // Parameters of previous pixel in column.
958 ParamState prev_param_state;
959 // Parameters of current pixel in column.
960 ParamState curr_param_state;
961 // Randomly sampled parameters.
962 ParamState rand_param_state;
963 // Cuda PRNG state for random sampling.
964 curandState rand_state;
966 if (col < cost_map.GetWidth()) {
967 // Read random state for current column.
968 rand_state = rand_state_map.Get(0, col);
969 // Parameters for first row in column.
970 prev_param_state.depth = depth_map.Get(0, col);
971 normal_map.GetSlice(0, col, prev_param_state.normal);
974 for (int row = 0; row < cost_map.GetHeight(); ++row) {
975 // Note that this must be executed even for pixels outside the borders,
976 // since pixels are used in the local neighborhood of the current pixel.
977 pcc_computer.Read(row);
979 if (col >= cost_map.GetWidth()) {
983 pcc_computer.row = row;
984 pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
985 pcc_computer.local_ref_squared_sum =
986 ref_squared_sum_image.Get(row, col);
988 // Propagate the depth at which the current ray intersects with the
989 // plane of the normal of the previous ray. This helps to better
990 // estimate the depth of very oblique structures, i.e. pixels whose
991 // normal direction is significantly different from their viewing
993 prev_param_state.depth = PropagateDepth(
994 prev_param_state.depth, prev_param_state.normal, row - 1, row);
996 // Read parameters for current pixel from previous sweep.
997 curr_param_state.depth = depth_map.Get(row, col);
998 normal_map.GetSlice(row, col, curr_param_state.normal);
1000 // Generate random parameters.
1001 rand_param_state.depth = PerturbDepth(
1002 options.perturbation, curr_param_state.depth, &rand_state);
1003 PerturbNormal(row, col, options.perturbation * M_PI,
1004 curr_param_state.normal, &rand_state,
1005 rand_param_state.normal);
1007 // Read in the backward message, compute selection probabilities and
1008 // modulate selection probabilities with priors.
1011 ComputePointAtDepth(row, col, curr_param_state.depth, point);
1013 for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1014 const float cost = cost_map.Get(row, col, image_idx);
1015 const float alpha = likelihood_computer.ComputeForwardMessage(
1016 cost, forward_message[image_idx]);
1017 const float beta = sel_prob_map.Get(row, col, image_idx);
1018 const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
1019 const float sel_prob = likelihood_computer.ComputeSelProb(
1020 alpha, beta, prev_prob, options.prev_sel_prob_weight);
1022 float cos_triangulation_angle;
1023 float cos_incident_angle;
1024 ComputeViewingAngles(poses_texture, point, curr_param_state.normal,
1025 image_idx, &cos_triangulation_angle,
1026 &cos_incident_angle);
1027 const float tri_prob =
1028 likelihood_computer.ComputeTriProb(cos_triangulation_angle);
1029 const float inc_prob =
1030 likelihood_computer.ComputeIncProb(cos_incident_angle);
1033 ComposeHomography(poses_texture, image_idx, row, col,
1034 curr_param_state.depth, curr_param_state.normal,
1036 const float res_prob =
1037 likelihood_computer.ComputeResolutionProb<kWindowSize>(
1040 sampling_probs[image_idx] =
1041 sel_prob * tri_prob * inc_prob * res_prob;
1044 TransformPDFToCDF(sampling_probs, cost_map.GetDepth());
1046 // Compute matching cost using Monte Carlo sampling of source images.
1047 // Images with higher selection probability are more likely to be
1048 // sampled. Hence, if only very few source images see the reference
1049 // image pixel, the same source image is likely to be sampled many
1050 // times. Instead of taking the best K probabilities, this sampling
1051 // scheme has the advantage of being adaptive to any distribution of
1052 // selection probabilities.
1054 constexpr int kNumCosts = 5;
1055 float costs[kNumCosts] = {0};
1056 const float depths[kNumCosts] = {
1057 curr_param_state.depth, prev_param_state.depth,
1058 rand_param_state.depth, curr_param_state.depth,
1059 rand_param_state.depth};
1060 const float* normals[kNumCosts] = {
1061 curr_param_state.normal, prev_param_state.normal,
1062 rand_param_state.normal, rand_param_state.normal,
1063 curr_param_state.normal};
1065 for (int sample = 0; sample < options.num_samples; ++sample) {
1066 const float rand_prob = curand_uniform(&rand_state) - FLT_EPSILON;
1068 pcc_computer.src_image_idx = -1;
1069 for (int image_idx = 0; image_idx < cost_map.GetDepth();
1071 const float prob = sampling_probs[image_idx];
1072 if (prob > rand_prob) {
1073 pcc_computer.src_image_idx = image_idx;
1078 if (pcc_computer.src_image_idx == -1) {
1082 costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx);
1083 if (kGeomConsistencyTerm) {
1084 costs[0] += options.geom_consistency_regularizer *
1085 ComputeGeomConsistencyCost(
1086 poses_texture, src_depth_maps_texture, row,
1087 col, depths[0], pcc_computer.src_image_idx,
1088 options.geom_consistency_max_cost);
1091 for (int i = 1; i < kNumCosts; ++i) {
1092 pcc_computer.depth = depths[i];
1093 pcc_computer.normal = normals[i];
1094 costs[i] += pcc_computer.Compute();
1095 if (kGeomConsistencyTerm) {
1097 options.geom_consistency_regularizer *
1098 ComputeGeomConsistencyCost(
1099 poses_texture, src_depth_maps_texture, row,
1100 col, depths[i], pcc_computer.src_image_idx,
1101 options.geom_consistency_max_cost);
1106 // Find the parameters of the minimum cost.
1107 const int min_cost_idx = FindMinCost<kNumCosts>(costs);
1108 const float best_depth = depths[min_cost_idx];
1109 const float* best_normal = normals[min_cost_idx];
1111 // Save best new parameters.
1112 depth_map.Set(row, col, best_depth);
1113 normal_map.SetSlice(row, col, best_normal);
1115 // Use the new cost to recompute the updated forward message and
1116 // the selection probability.
1117 pcc_computer.depth = best_depth;
1118 pcc_computer.normal = best_normal;
1119 for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1120 // Determine the cost for best depth.
1122 if (min_cost_idx == 0) {
1123 cost = cost_map.Get(row, col, image_idx);
1125 pcc_computer.src_image_idx = image_idx;
1126 cost = pcc_computer.Compute();
1127 cost_map.Set(row, col, image_idx, cost);
1130 const float alpha = likelihood_computer.ComputeForwardMessage(
1131 cost, forward_message[image_idx]);
1132 const float beta = sel_prob_map.Get(row, col, image_idx);
1133 const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
1134 const float prob = likelihood_computer.ComputeSelProb(
1135 alpha, beta, prev_prob, options.prev_sel_prob_weight);
1136 forward_message[image_idx] = alpha;
1137 sel_prob_map.Set(row, col, image_idx, prob);
1140 if (kFilterPhotoConsistency || kFilterGeomConsistency) {
1141 int num_consistent = 0;
1143 float best_point[3];
1144 ComputePointAtDepth(row, col, best_depth, best_point);
1146 const float min_ncc_prob = likelihood_computer.ComputeNCCProb(
1147 1.0f - options.filter_min_ncc);
1148 const float cos_min_triangulation_angle =
1149 cos(options.filter_min_triangulation_angle);
1151 for (int image_idx = 0; image_idx < cost_map.GetDepth();
1153 float cos_triangulation_angle;
1154 float cos_incident_angle;
1155 ComputeViewingAngles(poses_texture, best_point, best_normal,
1156 image_idx, &cos_triangulation_angle,
1157 &cos_incident_angle);
1158 if (cos_triangulation_angle > cos_min_triangulation_angle ||
1159 cos_incident_angle <= 0.0f) {
1163 if (!kFilterGeomConsistency) {
1164 if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob) {
1165 consistency_mask.Set(row, col, image_idx, 1);
1166 num_consistent += 1;
1168 } else if (!kFilterPhotoConsistency) {
1169 if (ComputeGeomConsistencyCost(
1170 poses_texture, src_depth_maps_texture, row, col,
1171 best_depth, image_idx,
1172 options.geom_consistency_max_cost) <=
1173 options.filter_geom_consistency_max_cost) {
1174 consistency_mask.Set(row, col, image_idx, 1);
1175 num_consistent += 1;
1178 if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob &&
1179 ComputeGeomConsistencyCost(
1180 poses_texture, src_depth_maps_texture, row, col,
1181 best_depth, image_idx,
1182 options.geom_consistency_max_cost) <=
1183 options.filter_geom_consistency_max_cost) {
1184 consistency_mask.Set(row, col, image_idx, 1);
1185 num_consistent += 1;
1190 if (num_consistent < options.filter_min_num_consistent) {
1191 depth_map.Set(row, col, 0.0f);
1192 normal_map.Set(row, col, 0, 0.0f);
1193 normal_map.Set(row, col, 1, 0.0f);
1194 normal_map.Set(row, col, 2, 0.0f);
1195 for (int image_idx = 0; image_idx < cost_map.GetDepth();
1197 consistency_mask.Set(row, col, image_idx, 0);
1202 // Update previous depth for next row.
1203 prev_param_state.depth = best_depth;
1204 for (int i = 0; i < 3; ++i) {
1205 prev_param_state.normal[i] = best_normal[i];
1209 if (col < cost_map.GetWidth()) {
1210 rand_state_map.Set(0, col, rand_state);
1214 PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options,
1215 const PatchMatch::Problem& problem)
1216 : options_(options),
1220 rotation_in_half_pi_(0) {
1221 SetBestCudaDevice(std::stoi(options_.gpu_index));
1225 InitWorkspaceMemory();
1228 void PatchMatchCuda::Run() {
1229 #define CASE_WINDOW_RADIUS(window_radius, window_step) \
1230 case window_radius: \
1231 RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \
1234 #define CASE_WINDOW_STEP(window_step) \
1236 switch (options_.window_radius) { \
1237 CASE_WINDOW_RADIUS(1, window_step) \
1238 CASE_WINDOW_RADIUS(2, window_step) \
1239 CASE_WINDOW_RADIUS(3, window_step) \
1240 CASE_WINDOW_RADIUS(4, window_step) \
1241 CASE_WINDOW_RADIUS(5, window_step) \
1242 CASE_WINDOW_RADIUS(6, window_step) \
1243 CASE_WINDOW_RADIUS(7, window_step) \
1244 CASE_WINDOW_RADIUS(8, window_step) \
1245 CASE_WINDOW_RADIUS(9, window_step) \
1246 CASE_WINDOW_RADIUS(10, window_step) \
1247 CASE_WINDOW_RADIUS(11, window_step) \
1248 CASE_WINDOW_RADIUS(12, window_step) \
1249 CASE_WINDOW_RADIUS(13, window_step) \
1250 CASE_WINDOW_RADIUS(14, window_step) \
1251 CASE_WINDOW_RADIUS(15, window_step) \
1252 CASE_WINDOW_RADIUS(16, window_step) \
1253 CASE_WINDOW_RADIUS(17, window_step) \
1254 CASE_WINDOW_RADIUS(18, window_step) \
1255 CASE_WINDOW_RADIUS(19, window_step) \
1256 CASE_WINDOW_RADIUS(20, window_step) \
1258 LOG(ERROR) << "Window size " << options_.window_radius \
1259 << " not supported"; \
1265 switch (options_.window_step) {
1269 LOG(ERROR) << "Window step " << options_.window_step
1270 << " not supported";
1275 #undef SWITCH_WINDOW_RADIUS
1276 #undef CALL_RUN_FUNC
1279 DepthMap PatchMatchCuda::GetDepthMap() const {
1280 return DepthMap(depth_map_->CopyToMat(), options_.depth_min,
1281 options_.depth_max);
1284 NormalMap PatchMatchCuda::GetNormalMap() const {
1285 return NormalMap(normal_map_->CopyToMat());
1288 Mat<float> PatchMatchCuda::GetSelProbMap() const {
1289 return prev_sel_prob_map_->CopyToMat();
1292 std::vector<int> PatchMatchCuda::GetConsistentImageIdxs() const {
1293 const Mat<uint8_t> mask = consistency_mask_->CopyToMat();
1294 std::vector<int> consistent_image_idxs;
1295 std::vector<int> pixel_consistent_image_idxs;
1296 pixel_consistent_image_idxs.reserve(mask.GetDepth());
1297 for (size_t r = 0; r < mask.GetHeight(); ++r) {
1298 for (size_t c = 0; c < mask.GetWidth(); ++c) {
1299 pixel_consistent_image_idxs.clear();
1300 for (size_t d = 0; d < mask.GetDepth(); ++d) {
1301 if (mask.Get(r, c, d)) {
1302 pixel_consistent_image_idxs.push_back(
1303 problem_.src_image_idxs[d]);
1306 if (pixel_consistent_image_idxs.size() > 0) {
1307 consistent_image_idxs.push_back(c);
1308 consistent_image_idxs.push_back(r);
1309 consistent_image_idxs.push_back(
1310 pixel_consistent_image_idxs.size());
1311 consistent_image_idxs.insert(
1312 consistent_image_idxs.end(),
1313 pixel_consistent_image_idxs.begin(),
1314 pixel_consistent_image_idxs.end());
1318 return consistent_image_idxs;
1321 template <int kWindowSize, int kWindowStep>
1322 void PatchMatchCuda::RunWithWindowSizeAndStep() {
1323 // Wait for all initializations to finish.
1324 CUDA_SYNC_AND_CHECK();
1326 CudaTimer total_timer;
1327 CudaTimer init_timer;
1329 ComputeCudaConfig();
1330 ComputeInitialCost<kWindowSize, kWindowStep>
1331 <<<sweep_grid_size_, sweep_block_size_>>>(
1332 *cost_map_, *depth_map_, *normal_map_,
1333 ref_image_texture_->GetObj(), *ref_image_->sum_image,
1334 *ref_image_->squared_sum_image,
1335 src_images_texture_->GetObj(), poses_texture_[0]->GetObj(),
1336 options_.sigma_spatial, options_.sigma_color);
1337 CUDA_SYNC_AND_CHECK();
1339 init_timer.Print("Initialization");
1341 const float total_num_steps = options_.num_iterations * 4;
1343 SweepOptions sweep_options;
1344 sweep_options.depth_min = options_.depth_min;
1345 sweep_options.depth_max = options_.depth_max;
1346 sweep_options.sigma_spatial = options_.sigma_spatial;
1347 sweep_options.sigma_color = options_.sigma_color;
1348 sweep_options.num_samples = options_.num_samples;
1349 sweep_options.ncc_sigma = options_.ncc_sigma;
1350 sweep_options.min_triangulation_angle =
1351 DEG2RAD(options_.min_triangulation_angle);
1352 sweep_options.incident_angle_sigma = options_.incident_angle_sigma;
1353 sweep_options.geom_consistency_regularizer =
1354 options_.geom_consistency_regularizer;
1355 sweep_options.geom_consistency_max_cost =
1356 options_.geom_consistency_max_cost;
1357 sweep_options.filter_min_ncc = options_.filter_min_ncc;
1358 sweep_options.filter_min_triangulation_angle =
1359 DEG2RAD(options_.filter_min_triangulation_angle);
1360 sweep_options.filter_min_num_consistent =
1361 options_.filter_min_num_consistent;
1362 sweep_options.filter_geom_consistency_max_cost =
1363 options_.filter_geom_consistency_max_cost;
1365 for (int iter = 0; iter < options_.num_iterations; ++iter) {
1366 CudaTimer iter_timer;
1368 for (int sweep = 0; sweep < 4; ++sweep) {
1369 CudaTimer sweep_timer;
1371 // Expenentially reduce amount of perturbation during the
1373 sweep_options.perturbation =
1374 1.0f / std::pow(2.0f, iter + sweep / 4.0f);
1376 // Linearly increase the influence of previous selection
1378 sweep_options.prev_sel_prob_weight =
1379 static_cast<float>(iter * 4 + sweep) / total_num_steps;
1381 const bool last_sweep =
1382 iter == options_.num_iterations - 1 && sweep == 3;
1384 #define CALL_SWEEP_FUNC \
1385 SweepFromTopToBottom<kWindowSize, kWindowStep, kGeomConsistencyTerm, \
1386 kFilterPhotoConsistency, kFilterGeomConsistency> \
1387 <<<sweep_grid_size_, sweep_block_size_>>>( \
1388 *global_workspace_, *rand_state_map_, *cost_map_, \
1389 *depth_map_, *normal_map_, *consistency_mask_, \
1390 *sel_prob_map_, *prev_sel_prob_map_, \
1391 ref_image_texture_->GetObj(), *ref_image_->sum_image, \
1392 *ref_image_->squared_sum_image, \
1393 src_images_texture_->GetObj(), \
1394 src_depth_maps_texture_ == nullptr \
1396 : src_depth_maps_texture_->GetObj(), \
1397 poses_texture_[rotation_in_half_pi_]->GetObj(), \
1401 if (options_.filter) {
1402 consistency_mask_.reset(new GpuMat<uint8_t>(
1403 cost_map_->GetWidth(), cost_map_->GetHeight(),
1404 cost_map_->GetDepth()));
1405 consistency_mask_->FillWithScalar(0);
1407 if (options_.geom_consistency) {
1408 const bool kGeomConsistencyTerm = true;
1409 if (options_.filter) {
1410 const bool kFilterPhotoConsistency = true;
1411 const bool kFilterGeomConsistency = true;
1414 const bool kFilterPhotoConsistency = false;
1415 const bool kFilterGeomConsistency = false;
1419 const bool kGeomConsistencyTerm = false;
1420 if (options_.filter) {
1421 const bool kFilterPhotoConsistency = true;
1422 const bool kFilterGeomConsistency = false;
1425 const bool kFilterPhotoConsistency = false;
1426 const bool kFilterGeomConsistency = false;
1431 const bool kFilterPhotoConsistency = false;
1432 const bool kFilterGeomConsistency = false;
1433 if (options_.geom_consistency) {
1434 const bool kGeomConsistencyTerm = true;
1437 const bool kGeomConsistencyTerm = false;
1442 #undef CALL_SWEEP_FUNC
1444 CUDA_SYNC_AND_CHECK();
1448 // Rotate selected image map.
1449 if (last_sweep && options_.filter) {
1450 std::unique_ptr<GpuMat<uint8_t>> rot_consistency_mask_(
1451 new GpuMat<uint8_t>(cost_map_->GetWidth(),
1452 cost_map_->GetHeight(),
1453 cost_map_->GetDepth()));
1454 consistency_mask_->Rotate(rot_consistency_mask_.get());
1455 consistency_mask_.swap(rot_consistency_mask_);
1458 sweep_timer.Print(" Sweep " + std::to_string(sweep + 1));
1461 iter_timer.Print("Iteration " + std::to_string(iter + 1));
1464 total_timer.Print("Total");
1467 void PatchMatchCuda::ComputeCudaConfig() {
1468 sweep_block_size_.x = THREADS_PER_BLOCK;
1469 sweep_block_size_.y = 1;
1470 sweep_block_size_.z = 1;
1471 sweep_grid_size_.x = (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1472 sweep_grid_size_.y = 1;
1473 sweep_grid_size_.z = 1;
1475 elem_wise_block_size_.x = THREADS_PER_BLOCK;
1476 elem_wise_block_size_.y = THREADS_PER_BLOCK;
1477 elem_wise_block_size_.z = 1;
1478 elem_wise_grid_size_.x =
1479 (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1480 elem_wise_grid_size_.y =
1481 (depth_map_->GetHeight() - 1) / THREADS_PER_BLOCK + 1;
1482 elem_wise_grid_size_.z = 1;
1485 void PatchMatchCuda::BindRefImageTexture() {
1486 cudaTextureDesc texture_desc;
1487 memset(&texture_desc, 0, sizeof(texture_desc));
1488 texture_desc.addressMode[0] = cudaAddressModeBorder;
1489 texture_desc.addressMode[1] = cudaAddressModeBorder;
1490 texture_desc.addressMode[2] = cudaAddressModeBorder;
1491 texture_desc.filterMode = cudaFilterModePoint;
1492 texture_desc.readMode = cudaReadModeNormalizedFloat;
1493 texture_desc.normalizedCoords = false;
1494 ref_image_texture_ = CudaArrayLayeredTexture<uint8_t>::FromGpuMat(
1495 texture_desc, *ref_image_->image);
1498 void PatchMatchCuda::InitRefImage() {
1499 const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1501 ref_width_ = ref_image.GetWidth();
1502 ref_height_ = ref_image.GetHeight();
1504 // Upload to device and filter.
1505 ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_));
1506 const std::vector<uint8_t> ref_image_array =
1507 ref_image.GetBitmap().ConvertToRowMajorArray();
1508 ref_image_->Filter(ref_image_array.data(), options_.window_radius,
1509 options_.window_step, options_.sigma_spatial,
1510 options_.sigma_color);
1512 BindRefImageTexture();
1515 void PatchMatchCuda::InitSourceImages() {
1516 // Determine maximum image size.
1517 size_t max_width = 0;
1518 size_t max_height = 0;
1519 for (const auto image_idx : problem_.src_image_idxs) {
1520 const Image& image = problem_.images->at(image_idx);
1521 if (image.GetWidth() > max_width) {
1522 max_width = image.GetWidth();
1524 if (image.GetHeight() > max_height) {
1525 max_height = image.GetHeight();
1529 // Upload source images to device.
1531 // Copy source images to contiguous memory block.
1532 const uint8_t kDefaultValue = 0;
1533 std::vector<uint8_t> src_images_host_data(
1534 static_cast<size_t>(max_width * max_height *
1535 problem_.src_image_idxs.size()),
1537 for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1538 const Image& image =
1539 problem_.images->at(problem_.src_image_idxs[i]);
1540 const Bitmap& bitmap = image.GetBitmap();
1542 src_images_host_data.data() + max_width * max_height * i;
1543 for (size_t r = 0; r < image.GetHeight(); ++r) {
1544 memcpy(dest, bitmap.GetScanline(r),
1545 image.GetWidth() * sizeof(uint8_t));
1550 // Create source images texture.
1551 cudaTextureDesc texture_desc;
1552 memset(&texture_desc, 0, sizeof(texture_desc));
1553 texture_desc.addressMode[0] = cudaAddressModeBorder;
1554 texture_desc.addressMode[1] = cudaAddressModeBorder;
1555 texture_desc.addressMode[2] = cudaAddressModeBorder;
1556 texture_desc.filterMode = cudaFilterModeLinear;
1557 texture_desc.readMode = cudaReadModeNormalizedFloat;
1558 texture_desc.normalizedCoords = false;
1559 src_images_texture_ = CudaArrayLayeredTexture<uint8_t>::FromHostArray(
1560 texture_desc, max_width, max_height,
1561 problem_.src_image_idxs.size(), src_images_host_data.data());
1564 // Upload source depth maps to device.
1565 if (options_.geom_consistency) {
1566 const float kDefaultValue = 0.0f;
1567 std::vector<float> src_depth_maps_host_data(
1568 static_cast<size_t>(max_width * max_height *
1569 problem_.src_image_idxs.size()),
1571 for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1572 const DepthMap& depth_map =
1573 problem_.depth_maps->at(problem_.src_image_idxs[i]);
1574 float* dest = src_depth_maps_host_data.data() +
1575 max_width * max_height * i;
1576 for (size_t r = 0; r < depth_map.GetHeight(); ++r) {
1577 memcpy(dest, depth_map.GetPtr() + r * depth_map.GetWidth(),
1578 depth_map.GetWidth() * sizeof(float));
1583 // Create source depth maps texture.
1584 cudaTextureDesc texture_desc;
1585 memset(&texture_desc, 0, sizeof(texture_desc));
1586 texture_desc.addressMode[0] = cudaAddressModeBorder;
1587 texture_desc.addressMode[1] = cudaAddressModeBorder;
1588 texture_desc.addressMode[2] = cudaAddressModeBorder;
1589 texture_desc.filterMode = cudaFilterModePoint;
1590 texture_desc.readMode = cudaReadModeElementType;
1591 texture_desc.normalizedCoords = false;
1592 src_depth_maps_texture_ = CudaArrayLayeredTexture<float>::FromHostArray(
1593 texture_desc, max_width, max_height,
1594 problem_.src_image_idxs.size(),
1595 src_depth_maps_host_data.data());
1599 void PatchMatchCuda::InitTransforms() {
1600 const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1602 //////////////////////////////////////////////////////////////////////////////
1603 // Generate rotated versions (counter-clockwise) of calibration matrix.
1604 //////////////////////////////////////////////////////////////////////////////
1606 for (size_t i = 0; i < 4; ++i) {
1607 ref_K_host_[i][0] = ref_image.GetK()[0];
1608 ref_K_host_[i][1] = ref_image.GetK()[2];
1609 ref_K_host_[i][2] = ref_image.GetK()[4];
1610 ref_K_host_[i][3] = ref_image.GetK()[5];
1613 // Rotated by 90 degrees.
1614 std::swap(ref_K_host_[1][0], ref_K_host_[1][2]);
1615 std::swap(ref_K_host_[1][1], ref_K_host_[1][3]);
1616 ref_K_host_[1][3] = ref_width_ - 1 - ref_K_host_[1][3];
1618 // Rotated by 180 degrees.
1619 ref_K_host_[2][1] = ref_width_ - 1 - ref_K_host_[2][1];
1620 ref_K_host_[2][3] = ref_height_ - 1 - ref_K_host_[2][3];
1622 // Rotated by 270 degrees.
1623 std::swap(ref_K_host_[3][0], ref_K_host_[3][2]);
1624 std::swap(ref_K_host_[3][1], ref_K_host_[3][3]);
1625 ref_K_host_[3][1] = ref_height_ - 1 - ref_K_host_[3][1];
1627 // Extract 1/fx, -cx/fx, fy, -cy/fy.
1628 for (size_t i = 0; i < 4; ++i) {
1629 ref_inv_K_host_[i][0] = 1.0f / ref_K_host_[i][0];
1630 ref_inv_K_host_[i][1] = -ref_K_host_[i][1] / ref_K_host_[i][0];
1631 ref_inv_K_host_[i][2] = 1.0f / ref_K_host_[i][2];
1632 ref_inv_K_host_[i][3] = -ref_K_host_[i][3] / ref_K_host_[i][2];
1635 // Bind 0 degrees version to constant global memory.
1636 CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[0], sizeof(float) * 4,
1637 0, cudaMemcpyHostToDevice));
1638 CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[0],
1639 sizeof(float) * 4, 0,
1640 cudaMemcpyHostToDevice));
1642 //////////////////////////////////////////////////////////////////////////////
1643 // Generate rotated versions of camera poses.
1644 //////////////////////////////////////////////////////////////////////////////
1647 memcpy(rotated_R, ref_image.GetR(), 9 * sizeof(float));
1650 memcpy(rotated_T, ref_image.GetT(), 3 * sizeof(float));
1652 // Matrix for 90deg rotation around Z-axis in counter-clockwise direction.
1653 const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1};
1655 cudaTextureDesc texture_desc;
1656 memset(&texture_desc, 0, sizeof(texture_desc));
1657 texture_desc.addressMode[0] = cudaAddressModeBorder;
1658 texture_desc.addressMode[1] = cudaAddressModeBorder;
1659 texture_desc.addressMode[2] = cudaAddressModeBorder;
1660 texture_desc.filterMode = cudaFilterModePoint;
1661 texture_desc.readMode = cudaReadModeElementType;
1662 texture_desc.normalizedCoords = false;
1664 for (size_t i = 0; i < 4; ++i) {
1665 const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12;
1666 std::vector<float> poses_host_data(kNumTformParams *
1667 problem_.src_image_idxs.size());
1669 for (const auto image_idx : problem_.src_image_idxs) {
1670 const Image& image = problem_.images->at(image_idx);
1672 const float K[4] = {image.GetK()[0], image.GetK()[2],
1673 image.GetK()[4], image.GetK()[5]};
1674 memcpy(poses_host_data.data() + offset, K, 4 * sizeof(float));
1679 ComputeRelativePose(rotated_R, rotated_T, image.GetR(),
1680 image.GetT(), rel_R, rel_T);
1681 memcpy(poses_host_data.data() + offset, rel_R, 9 * sizeof(float));
1683 memcpy(poses_host_data.data() + offset, rel_T, 3 * sizeof(float));
1687 ComputeProjectionCenter(rel_R, rel_T, C);
1688 memcpy(poses_host_data.data() + offset, C, 3 * sizeof(float));
1692 ComposeProjectionMatrix(image.GetK(), rel_R, rel_T, P);
1693 memcpy(poses_host_data.data() + offset, P, 12 * sizeof(float));
1697 ComposeInverseProjectionMatrix(image.GetK(), rel_R, rel_T, inv_P);
1698 memcpy(poses_host_data.data() + offset, inv_P, 12 * sizeof(float));
1702 poses_texture_[i] = CudaArrayLayeredTexture<float>::FromHostArray(
1703 texture_desc, kNumTformParams, problem_.src_image_idxs.size(),
1704 1, poses_host_data.data());
1706 RotatePose(R_z90, rotated_R, rotated_T);
1710 void PatchMatchCuda::InitWorkspaceMemory() {
1711 rand_state_map_.reset(new GpuMatPRNG(ref_width_, ref_height_));
1713 depth_map_.reset(new GpuMat<float>(ref_width_, ref_height_));
1714 if (options_.geom_consistency) {
1715 const DepthMap& init_depth_map =
1716 problem_.depth_maps->at(problem_.ref_image_idx);
1717 depth_map_->CopyToDevice(init_depth_map.GetPtr(),
1718 init_depth_map.GetWidth() * sizeof(float));
1720 depth_map_->FillWithRandomNumbers(options_.depth_min,
1721 options_.depth_max, *rand_state_map_);
1724 normal_map_.reset(new GpuMat<float>(ref_width_, ref_height_, 3));
1726 // Note that it is not necessary to keep the selection probability map in
1727 // memory for all pixels. Theoretically, it is possible to incorporate
1728 // the temporary selection probabilities in the global_workspace_.
1729 // However, it is useful to keep the probabilities for the entire image
1730 // in memory, so that it can be exported.
1731 sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1732 problem_.src_image_idxs.size()));
1733 prev_sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1734 problem_.src_image_idxs.size()));
1735 prev_sel_prob_map_->FillWithScalar(0.5f);
1737 cost_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1738 problem_.src_image_idxs.size()));
1740 const int ref_max_dim = std::max(ref_width_, ref_height_);
1741 global_workspace_.reset(
1742 new GpuMat<float>(ref_max_dim, problem_.src_image_idxs.size(), 2));
1744 consistency_mask_.reset(new GpuMat<uint8_t>(0, 0, 0));
1746 ComputeCudaConfig();
1748 if (options_.geom_consistency) {
1749 const NormalMap& init_normal_map =
1750 problem_.normal_maps->at(problem_.ref_image_idx);
1751 normal_map_->CopyToDevice(init_normal_map.GetPtr(),
1752 init_normal_map.GetWidth() * sizeof(float));
1754 InitNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1755 *normal_map_, *rand_state_map_);
1759 void PatchMatchCuda::Rotate() {
1760 rotation_in_half_pi_ = (rotation_in_half_pi_ + 1) % 4;
1764 if (rotation_in_half_pi_ % 2 == 0) {
1766 height = ref_height_;
1768 width = ref_height_;
1769 height = ref_width_;
1772 // Rotate random map.
1774 std::unique_ptr<GpuMatPRNG> rotated_rand_state_map(
1775 new GpuMatPRNG(width, height));
1776 rand_state_map_->Rotate(rotated_rand_state_map.get());
1777 rand_state_map_.swap(rotated_rand_state_map);
1780 // Rotate depth map.
1782 std::unique_ptr<GpuMat<float>> rotated_depth_map(
1783 new GpuMat<float>(width, height));
1784 depth_map_->Rotate(rotated_depth_map.get());
1785 depth_map_.swap(rotated_depth_map);
1788 // Rotate normal map.
1790 RotateNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1792 std::unique_ptr<GpuMat<float>> rotated_normal_map(
1793 new GpuMat<float>(width, height, 3));
1794 normal_map_->Rotate(rotated_normal_map.get());
1795 normal_map_.swap(rotated_normal_map);
1798 // Rotate reference image.
1800 std::unique_ptr<GpuMatRefImage> rotated_ref_image(
1801 new GpuMatRefImage(width, height));
1802 ref_image_->image->Rotate(rotated_ref_image->image.get());
1803 ref_image_->sum_image->Rotate(rotated_ref_image->sum_image.get());
1804 ref_image_->squared_sum_image->Rotate(
1805 rotated_ref_image->squared_sum_image.get());
1806 ref_image_.swap(rotated_ref_image);
1807 BindRefImageTexture();
1810 // Rotate selection probability map.
1811 prev_sel_prob_map_.reset(
1812 new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1813 sel_prob_map_->Rotate(prev_sel_prob_map_.get());
1814 sel_prob_map_.reset(
1815 new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1819 std::unique_ptr<GpuMat<float>> rotated_cost_map(new GpuMat<float>(
1820 width, height, problem_.src_image_idxs.size()));
1821 cost_map_->Rotate(rotated_cost_map.get());
1822 cost_map_.swap(rotated_cost_map);
1825 // Rotate calibration.
1826 CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_],
1827 sizeof(float) * 4, 0,
1828 cudaMemcpyHostToDevice));
1830 cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[rotation_in_half_pi_],
1831 sizeof(float) * 4, 0, cudaMemcpyHostToDevice));
1833 // Recompute Cuda configuration for rotated reference image.
1834 ComputeCudaConfig();
1838 } // namespace colmap