ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
patch_match_cuda.cu
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #define _USE_MATH_DEFINES
9 
10 #include <algorithm>
11 #include <cfloat>
12 #include <cmath>
13 #include <cstdint>
14 #include <sstream>
15 
16 #include "mvs/patch_match_cuda.h"
17 #include "util/cuda.h"
18 #include "util/cudacc.h"
19 #include "util/logging.h"
20 
21 // The number of threads per Cuda thread. Warning: Do not change this value,
22 // since the templated window sizes rely on this value.
23 #define THREADS_PER_BLOCK 32
24 
25 // We must not include "util/math.h" to avoid any Eigen includes here,
26 // since Visual Studio cannot compile some of the Eigen/Boost expressions.
27 #ifndef DEG2RAD
28 #define DEG2RAD(deg) deg * 0.0174532925199432
29 #endif
30 
31 namespace colmap {
32 namespace mvs {
33 
34 // Calibration of reference image as {fx, cx, fy, cy}.
35 __constant__ float ref_K[4];
36 // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}.
37 __constant__ float ref_inv_K[4];
38 
39 __device__ inline void Mat33DotVec3(const float mat[9],
40  const float vec[3],
41  float result[3]) {
42  result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
43  result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
44  result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
45 }
46 
47 __device__ inline void Mat33DotVec3Homogeneous(const float mat[9],
48  const float vec[2],
49  float result[2]) {
50  const float inv_z = 1.0f / (mat[6] * vec[0] + mat[7] * vec[1] + mat[8]);
51  result[0] = inv_z * (mat[0] * vec[0] + mat[1] * vec[1] + mat[2]);
52  result[1] = inv_z * (mat[3] * vec[0] + mat[4] * vec[1] + mat[5]);
53 }
54 
55 __device__ inline float DotProduct3(const float vec1[3], const float vec2[3]) {
56  return vec1[0] * vec2[0] + vec1[1] * vec2[1] + vec1[2] * vec2[2];
57 }
58 
59 __device__ inline float GenerateRandomDepth(const float depth_min,
60  const float depth_max,
61  curandState* rand_state) {
62  return curand_uniform(rand_state) * (depth_max - depth_min) + depth_min;
63 }
64 
65 __device__ inline void GenerateRandomNormal(const int row,
66  const int col,
67  curandState* rand_state,
68  float normal[3]) {
69  // Unbiased sampling of normal, according to George Marsaglia, "Choosing a
70  // Point from the Surface of a Sphere", 1972.
71  float v1 = 0.0f;
72  float v2 = 0.0f;
73  float s = 2.0f;
74  while (s >= 1.0f) {
75  v1 = 2.0f * curand_uniform(rand_state) - 1.0f;
76  v2 = 2.0f * curand_uniform(rand_state) - 1.0f;
77  s = v1 * v1 + v2 * v2;
78  }
79 
80  const float s_norm = sqrt(1.0f - s);
81  normal[0] = 2.0f * v1 * s_norm;
82  normal[1] = 2.0f * v2 * s_norm;
83  normal[2] = 1.0f - 2.0f * s;
84 
85  // Make sure normal is looking away from camera.
86  const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
87  ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
88  if (DotProduct3(normal, view_ray) > 0) {
89  normal[0] = -normal[0];
90  normal[1] = -normal[1];
91  normal[2] = -normal[2];
92  }
93 }
94 
95 __device__ inline float PerturbDepth(const float perturbation,
96  const float depth,
97  curandState* rand_state) {
98  const float depth_min = (1.0f - perturbation) * depth;
99  const float depth_max = (1.0f + perturbation) * depth;
100  return GenerateRandomDepth(depth_min, depth_max, rand_state);
101 }
102 
103 __device__ inline void PerturbNormal(const int row,
104  const int col,
105  const float perturbation,
106  const float normal[3],
107  curandState* rand_state,
108  float perturbed_normal[3],
109  const int num_trials = 0) {
110  // Perturbation rotation angles.
111  const float a1 = (curand_uniform(rand_state) - 0.5f) * perturbation;
112  const float a2 = (curand_uniform(rand_state) - 0.5f) * perturbation;
113  const float a3 = (curand_uniform(rand_state) - 0.5f) * perturbation;
114 
115  const float sin_a1 = sin(a1);
116  const float sin_a2 = sin(a2);
117  const float sin_a3 = sin(a3);
118  const float cos_a1 = cos(a1);
119  const float cos_a2 = cos(a2);
120  const float cos_a3 = cos(a3);
121 
122  // R = Rx * Ry * Rz
123  float R[9];
124  R[0] = cos_a2 * cos_a3;
125  R[1] = -cos_a2 * sin_a3;
126  R[2] = sin_a2;
127  R[3] = cos_a1 * sin_a3 + cos_a3 * sin_a1 * sin_a2;
128  R[4] = cos_a1 * cos_a3 - sin_a1 * sin_a2 * sin_a3;
129  R[5] = -cos_a2 * sin_a1;
130  R[6] = sin_a1 * sin_a3 - cos_a1 * cos_a3 * sin_a2;
131  R[7] = cos_a3 * sin_a1 + cos_a1 * sin_a2 * sin_a3;
132  R[8] = cos_a1 * cos_a2;
133 
134  // Perturb the normal vector.
135  Mat33DotVec3(R, normal, perturbed_normal);
136 
137  // Make sure the perturbed normal is still looking in the same direction as
138  // the viewing direction, otherwise try again but with smaller perturbation.
139  const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
140  ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
141  if (DotProduct3(perturbed_normal, view_ray) >= 0.0f) {
142  const int kMaxNumTrials = 3;
143  if (num_trials < kMaxNumTrials) {
144  PerturbNormal(row, col, 0.5f * perturbation, normal, rand_state,
145  perturbed_normal, num_trials + 1);
146  return;
147  } else {
148  perturbed_normal[0] = normal[0];
149  perturbed_normal[1] = normal[1];
150  perturbed_normal[2] = normal[2];
151  return;
152  }
153  }
154 
155  // Make sure normal has unit norm.
156  const float inv_norm =
157  rsqrt(DotProduct3(perturbed_normal, perturbed_normal));
158  perturbed_normal[0] *= inv_norm;
159  perturbed_normal[1] *= inv_norm;
160  perturbed_normal[2] *= inv_norm;
161 }
162 
163 __device__ inline void ComputePointAtDepth(const float row,
164  const float col,
165  const float depth,
166  float point[3]) {
167  point[0] = depth * (ref_inv_K[0] * col + ref_inv_K[1]);
168  point[1] = depth * (ref_inv_K[2] * row + ref_inv_K[3]);
169  point[2] = depth;
170 }
171 
172 // Transfer depth on plane from viewing ray at row1 to row2. The returned
173 // depth is the intersection of the viewing ray through row2 with the plane
174 // at row1 defined by the given depth and normal.
175 __device__ inline float PropagateDepth(const float depth1,
176  const float normal1[3],
177  const float row1,
178  const float row2) {
179  // Point along first viewing ray.
180  const float x1 = depth1 * (ref_inv_K[2] * row1 + ref_inv_K[3]);
181  const float y1 = depth1;
182  // Point on plane defined by point along first viewing ray and plane
183  // normal1.
184  const float x2 = x1 + normal1[2];
185  const float y2 = y1 - normal1[1];
186 
187  // Origin of second viewing ray.
188  // const float x3 = 0.0f;
189  // const float y3 = 0.0f;
190  // Point on second viewing ray.
191  const float x4 = ref_inv_K[2] * row2 + ref_inv_K[3];
192  // const float y4 = 1.0f;
193 
194  // Intersection of the lines ((x1, y1), (x2, y2)) and ((x3, y3), (x4, y4)).
195  const float denom = x2 - x1 + x4 * (y1 - y2);
196  constexpr float kEps = 1e-5f;
197  if (abs(denom) < kEps) {
198  return depth1;
199  }
200  const float nom = y1 * x2 - x1 * y2;
201  return nom / denom;
202 }
203 
204 // First, compute triangulation angle between reference and source image for 3D
205 // point. Second, compute incident angle between viewing direction of source
206 // image and normal direction of 3D point. Both angles are cosine distances.
207 __device__ inline void ComputeViewingAngles(
208  const cudaTextureObject_t poses_texture,
209  const float point[3],
210  const float normal[3],
211  const int image_idx,
212  float* cos_triangulation_angle,
213  float* cos_incident_angle) {
214  *cos_triangulation_angle = 0.0f;
215  *cos_incident_angle = 0.0f;
216 
217  // Projection center of source image.
218  float C[3];
219  for (int i = 0; i < 3; ++i) {
220  C[i] = tex2D<float>(poses_texture, i + 16, image_idx);
221  }
222 
223  // Ray from point to camera.
224  const float SX[3] = {C[0] - point[0], C[1] - point[1], C[2] - point[2]};
225 
226  // Length of ray from reference image to point.
227  const float RX_inv_norm = rsqrt(DotProduct3(point, point));
228 
229  // Length of ray from source image to point.
230  const float SX_inv_norm = rsqrt(DotProduct3(SX, SX));
231 
232  *cos_incident_angle = DotProduct3(SX, normal) * SX_inv_norm;
233  *cos_triangulation_angle =
234  DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
235 }
236 
237 __device__ inline void ComposeHomography(
238  const cudaTextureObject_t poses_texture,
239  const int image_idx,
240  const int row,
241  const int col,
242  const float depth,
243  const float normal[3],
244  float H[9]) {
245  // Calibration of source image.
246  float K[4];
247  for (int i = 0; i < 4; ++i) {
248  K[i] = tex2D<float>(poses_texture, i, image_idx);
249  }
250 
251  // Relative rotation between reference and source image.
252  float R[9];
253  for (int i = 0; i < 9; ++i) {
254  R[i] = tex2D<float>(poses_texture, i + 4, image_idx);
255  }
256 
257  // Relative translation between reference and source image.
258  float T[3];
259  for (int i = 0; i < 3; ++i) {
260  T[i] = tex2D<float>(poses_texture, i + 13, image_idx);
261  }
262 
263  // Distance to the plane.
264  const float dist =
265  depth *
266  (normal[0] * (ref_inv_K[0] * col + ref_inv_K[1]) +
267  normal[1] * (ref_inv_K[2] * row + ref_inv_K[3]) + normal[2]);
268  const float inv_dist = 1.0f / dist;
269 
270  const float inv_dist_N0 = inv_dist * normal[0];
271  const float inv_dist_N1 = inv_dist * normal[1];
272  const float inv_dist_N2 = inv_dist * normal[2];
273 
274  // Homography as H = K * (R - T * n' / d) * Kref^-1.
275  H[0] = ref_inv_K[0] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
276  K[1] * (R[6] + inv_dist_N0 * T[2]));
277  H[1] = ref_inv_K[2] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
278  K[1] * (R[7] + inv_dist_N1 * T[2]));
279  H[2] = K[0] * (R[2] + inv_dist_N2 * T[0]) +
280  K[1] * (R[8] + inv_dist_N2 * T[2]) +
281  ref_inv_K[1] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
282  K[1] * (R[6] + inv_dist_N0 * T[2])) +
283  ref_inv_K[3] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
284  K[1] * (R[7] + inv_dist_N1 * T[2]));
285  H[3] = ref_inv_K[0] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
286  K[3] * (R[6] + inv_dist_N0 * T[2]));
287  H[4] = ref_inv_K[2] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
288  K[3] * (R[7] + inv_dist_N1 * T[2]));
289  H[5] = K[2] * (R[5] + inv_dist_N2 * T[1]) +
290  K[3] * (R[8] + inv_dist_N2 * T[2]) +
291  ref_inv_K[1] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
292  K[3] * (R[6] + inv_dist_N0 * T[2])) +
293  ref_inv_K[3] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
294  K[3] * (R[7] + inv_dist_N1 * T[2]));
295  H[6] = ref_inv_K[0] * (R[6] + inv_dist_N0 * T[2]);
296  H[7] = ref_inv_K[2] * (R[7] + inv_dist_N1 * T[2]);
297  H[8] = R[8] + ref_inv_K[1] * (R[6] + inv_dist_N0 * T[2]) +
298  ref_inv_K[3] * (R[7] + inv_dist_N1 * T[2]) + inv_dist_N2 * T[2];
299 }
300 
301 // Each thread in the current warp / thread block reads in 3 columns of the
302 // reference image. The shared memory holds 3 * THREADS_PER_BLOCK columns and
303 // kWindowSize rows of the reference image. Each thread copies every
304 // THREADS_PER_BLOCK-th column from global to shared memory offset by its ID.
305 // For example, if THREADS_PER_BLOCK = 32, then thread 0 reads columns 0, 32, 64
306 // and thread 1 columns 1, 33, 65. When computing the photoconsistency, which is
307 // shared among each thread block, each thread can then read the reference image
308 // colors from shared memory. Note that this limits the window radius to a
309 // maximum of THREADS_PER_BLOCK.
310 template <int kWindowSize>
311 struct LocalRefImage {
312  const static int kWindowRadius = kWindowSize / 2;
313  const static int kThreadBlockRadius = 1;
314  const static int kThreadBlockSize = 2 * kThreadBlockRadius + 1;
315  const static int kNumRows = kWindowSize;
316  const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK;
317  const static int kDataSize = kNumRows * kNumColumns;
318 
319  __device__ explicit LocalRefImage(
320  const cudaTextureObject_t ref_image_texture)
321  : ref_image_texture_(ref_image_texture) {}
322 
323  float* data = nullptr;
324 
325  __device__ inline void Read(const int row) {
326  // For the first row, read the entire block into shared memory. For all
327  // consecutive rows, it is only necessary to shift the rows in shared
328  // memory up by one element and then read in a new row at the bottom of
329  // the shared memory. Note that this assumes that the calling loop
330  // starts with the first row and then consecutively reads in the next
331  // row.
332 
333  const int thread_id = threadIdx.x;
334  const int thread_block_first_id = blockDim.x * blockIdx.x;
335 
336  const int local_col_start = thread_id;
337  const int global_col_start = thread_block_first_id -
338  kThreadBlockRadius * THREADS_PER_BLOCK +
339  thread_id;
340 
341  if (row == 0) {
342  int global_row = row - kWindowRadius;
343  for (int local_row = 0; local_row < kNumRows;
344  ++local_row, ++global_row) {
345  int local_col = local_col_start;
346  int global_col = global_col_start;
347 #pragma unroll
348  for (int block = 0; block < kThreadBlockSize; ++block) {
349  data[local_row * kNumColumns + local_col] = tex2D<float>(
350  ref_image_texture_, global_col, global_row);
351  local_col += THREADS_PER_BLOCK;
352  global_col += THREADS_PER_BLOCK;
353  }
354  }
355  } else {
356  // Move rows in shared memory up by one row.
357  for (int local_row = 1; local_row < kNumRows; ++local_row) {
358  int local_col = local_col_start;
359 #pragma unroll
360  for (int block = 0; block < kThreadBlockSize; ++block) {
361  data[(local_row - 1) * kNumColumns + local_col] =
362  data[local_row * kNumColumns + local_col];
363  local_col += THREADS_PER_BLOCK;
364  }
365  }
366 
367  // Read next row into the last row of shared memory.
368  const int local_row = kNumRows - 1;
369  const int global_row = row + kWindowRadius;
370  int local_col = local_col_start;
371  int global_col = global_col_start;
372 #pragma unroll
373  for (int block = 0; block < kThreadBlockSize; ++block) {
374  data[local_row * kNumColumns + local_col] = tex2D<float>(
375  ref_image_texture_, global_col, global_row);
376  local_col += THREADS_PER_BLOCK;
377  global_col += THREADS_PER_BLOCK;
378  }
379  }
380  }
381 
382 private:
383  const cudaTextureObject_t ref_image_texture_;
384 };
385 
386 // The return values is 1 - NCC, so the range is [0, 2], the smaller the
387 // value, the better the color consistency.
388 template <int kWindowSize, int kWindowStep>
389 struct PhotoConsistencyCostComputer {
390  const static int kWindowRadius = kWindowSize / 2;
391 
392  __device__ PhotoConsistencyCostComputer(
393  const cudaTextureObject_t ref_image_texture,
394  const cudaTextureObject_t src_images_texture,
395  const cudaTextureObject_t poses_texture,
396  const float sigma_spatial,
397  const float sigma_color)
398  : local_ref_image(ref_image_texture),
399  src_images_texture_(src_images_texture),
400  poses_texture_(poses_texture),
401  bilateral_weight_computer_(sigma_spatial, sigma_color) {}
402 
403  // Maximum photo consistency cost as 1 - min(NCC).
404  const float kMaxCost = 2.0f;
405 
406  // Thread warp local reference image data around current patch.
407  typedef LocalRefImage<kWindowSize> LocalRefImageType;
408  LocalRefImageType local_ref_image;
409 
410  // Precomputed sum of raw and squared image intensities.
411  float local_ref_sum = 0.0f;
412  float local_ref_squared_sum = 0.0f;
413 
414  // Index of source image.
415  int src_image_idx = -1;
416 
417  // Center position of patch in reference image.
418  int row = -1;
419  int col = -1;
420 
421  // Depth and normal for which to warp patch.
422  float depth = 0.0f;
423  const float* normal = nullptr;
424 
425  __device__ inline void Read(const int row) {
426  local_ref_image.Read(row);
427  __syncthreads();
428  }
429 
430  __device__ inline float Compute() const {
431  float tform[9];
432  ComposeHomography(poses_texture_, src_image_idx, row, col, depth,
433  normal, tform);
434 
435  float tform_step[8];
436  for (int i = 0; i < 8; ++i) {
437  tform_step[i] = kWindowStep * tform[i];
438  }
439 
440  const int thread_id = threadIdx.x;
441  const int row_start = row - kWindowRadius;
442  const int col_start = col - kWindowRadius;
443 
444  float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
445  float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
446  float z = tform[6] * col_start + tform[7] * row_start + tform[8];
447  float base_col_src = col_src;
448  float base_row_src = row_src;
449  float base_z = z;
450 
451  int ref_image_idx = THREADS_PER_BLOCK - kWindowRadius + thread_id;
452  int ref_image_base_idx = ref_image_idx;
453 
454  const float ref_center_color =
455  local_ref_image.data[ref_image_idx +
456  kWindowRadius * 3 * THREADS_PER_BLOCK +
457  kWindowRadius];
458  const float ref_color_sum = local_ref_sum;
459  const float ref_color_squared_sum = local_ref_squared_sum;
460  float src_color_sum = 0.0f;
461  float src_color_squared_sum = 0.0f;
462  float src_ref_color_sum = 0.0f;
463  float bilateral_weight_sum = 0.0f;
464 
465  for (int row = -kWindowRadius; row <= kWindowRadius;
466  row += kWindowStep) {
467  for (int col = -kWindowRadius; col <= kWindowRadius;
468  col += kWindowStep) {
469  const float inv_z = 1.0f / z;
470  const float norm_col_src = inv_z * col_src + 0.5f;
471  const float norm_row_src = inv_z * row_src + 0.5f;
472  const float ref_color = local_ref_image.data[ref_image_idx];
473  const float src_color =
474  tex2DLayered<float>(src_images_texture_, norm_col_src,
475  norm_row_src, src_image_idx);
476 
477  const float bilateral_weight =
478  bilateral_weight_computer_.Compute(
479  row, col, ref_center_color, ref_color);
480 
481  const float bilateral_weight_src = bilateral_weight * src_color;
482 
483  src_color_sum += bilateral_weight_src;
484  src_color_squared_sum += bilateral_weight_src * src_color;
485  src_ref_color_sum += bilateral_weight_src * ref_color;
486  bilateral_weight_sum += bilateral_weight;
487 
488  ref_image_idx += kWindowStep;
489 
490  // Accumulate warped source coordinates per row to reduce
491  // numerical errors. Note that this is necessary since
492  // coordinates usually are in the order of 1000s as opposed to
493  // the color values which are normalized to the range [0, 1].
494  col_src += tform_step[0];
495  row_src += tform_step[3];
496  z += tform_step[6];
497  }
498 
499  ref_image_base_idx += kWindowStep * 3 * THREADS_PER_BLOCK;
500  ref_image_idx = ref_image_base_idx;
501 
502  base_col_src += tform_step[1];
503  base_row_src += tform_step[4];
504  base_z += tform_step[7];
505 
506  col_src = base_col_src;
507  row_src = base_row_src;
508  z = base_z;
509  }
510 
511  const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
512  src_color_sum *= inv_bilateral_weight_sum;
513  src_color_squared_sum *= inv_bilateral_weight_sum;
514  src_ref_color_sum *= inv_bilateral_weight_sum;
515 
516  const float ref_color_var =
517  ref_color_squared_sum - ref_color_sum * ref_color_sum;
518  const float src_color_var =
519  src_color_squared_sum - src_color_sum * src_color_sum;
520 
521  // Based on Jensen's Inequality for convex functions, the variance
522  // should always be larger than 0. Do not make this threshold smaller.
523  constexpr float kMinVar = 1e-5f;
524  if (ref_color_var < kMinVar || src_color_var < kMinVar) {
525  return kMaxCost;
526  } else {
527  const float src_ref_color_covar =
528  src_ref_color_sum - ref_color_sum * src_color_sum;
529  const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
530  return max(0.0f, min(kMaxCost, 1.0f - src_ref_color_covar /
531  src_ref_color_var));
532  }
533  }
534 
535 private:
536  const cudaTextureObject_t src_images_texture_;
537  const cudaTextureObject_t poses_texture_;
538  const BilateralWeightComputer bilateral_weight_computer_;
539 };
540 
541 __device__ inline float ComputeGeomConsistencyCost(
542  const cudaTextureObject_t poses_texture,
543  const cudaTextureObject_t src_depth_maps_texture,
544  const float row,
545  const float col,
546  const float depth,
547  const int image_idx,
548  const float max_cost) {
549  // Extract projection matrices for source image.
550  float P[12];
551  for (int i = 0; i < 12; ++i) {
552  P[i] = tex2D<float>(poses_texture, i + 19, image_idx);
553  }
554  float inv_P[12];
555  for (int i = 0; i < 12; ++i) {
556  inv_P[i] = tex2D<float>(poses_texture, i + 31, image_idx);
557  }
558 
559  // Project point in reference image to world.
560  float forward_point[3];
561  ComputePointAtDepth(row, col, depth, forward_point);
562 
563  // Project world point to source image.
564  const float inv_forward_z =
565  1.0f / (P[8] * forward_point[0] + P[9] * forward_point[1] +
566  P[10] * forward_point[2] + P[11]);
567  float src_col =
568  inv_forward_z * (P[0] * forward_point[0] + P[1] * forward_point[1] +
569  P[2] * forward_point[2] + P[3]);
570  float src_row =
571  inv_forward_z * (P[4] * forward_point[0] + P[5] * forward_point[1] +
572  P[6] * forward_point[2] + P[7]);
573 
574  // Extract depth in source image.
575  const float src_depth = tex2DLayered<float>(
576  src_depth_maps_texture, src_col + 0.5f, src_row + 0.5f, image_idx);
577 
578  // Projection outside of source image.
579  if (src_depth == 0.0f) {
580  return max_cost;
581  }
582 
583  // Project point in source image to world.
584  src_col *= src_depth;
585  src_row *= src_depth;
586  const float backward_point_x = inv_P[0] * src_col + inv_P[1] * src_row +
587  inv_P[2] * src_depth + inv_P[3];
588  const float backward_point_y = inv_P[4] * src_col + inv_P[5] * src_row +
589  inv_P[6] * src_depth + inv_P[7];
590  const float backward_point_z = inv_P[8] * src_col + inv_P[9] * src_row +
591  inv_P[10] * src_depth + inv_P[11];
592  const float inv_backward_point_z = 1.0f / backward_point_z;
593 
594  // Project world point back to reference image.
595  const float backward_col =
596  inv_backward_point_z *
597  (ref_K[0] * backward_point_x + ref_K[1] * backward_point_z);
598  const float backward_row =
599  inv_backward_point_z *
600  (ref_K[2] * backward_point_y + ref_K[3] * backward_point_z);
601 
602  // Return truncated reprojection error between original observation and
603  // the forward-backward projected observation.
604  const float diff_col = col - backward_col;
605  const float diff_row = row - backward_row;
606  return min(max_cost, sqrt(diff_col * diff_col + diff_row * diff_row));
607 }
608 
609 // Find index of minimum in given values.
610 template <int kNumCosts>
611 __device__ inline int FindMinCost(const float costs[kNumCosts]) {
612  float min_cost = costs[0];
613  int min_cost_idx = 0;
614  for (int idx = 1; idx < kNumCosts; ++idx) {
615  if (costs[idx] <= min_cost) {
616  min_cost = costs[idx];
617  min_cost_idx = idx;
618  }
619  }
620  return min_cost_idx;
621 }
622 
623 __device__ inline void TransformPDFToCDF(float* probs, const int num_probs) {
624  float prob_sum = 0.0f;
625  for (int i = 0; i < num_probs; ++i) {
626  prob_sum += probs[i];
627  }
628  const float inv_prob_sum = 1.0f / prob_sum;
629 
630  float cum_prob = 0.0f;
631  for (int i = 0; i < num_probs; ++i) {
632  const float prob = probs[i] * inv_prob_sum;
633  cum_prob += prob;
634  probs[i] = cum_prob;
635  }
636 }
637 
638 class LikelihoodComputer {
639 public:
640  __device__ LikelihoodComputer(const float ncc_sigma,
641  const float min_triangulation_angle,
642  const float incident_angle_sigma)
643  : cos_min_triangulation_angle_(cos(min_triangulation_angle)),
644  inv_incident_angle_sigma_square_(
645  -0.5f / (incident_angle_sigma * incident_angle_sigma)),
646  inv_ncc_sigma_square_(-0.5f / (ncc_sigma * ncc_sigma)),
647  ncc_norm_factor_(ComputeNCCCostNormFactor(ncc_sigma)) {}
648 
649  // Compute forward message from current cost and forward message of
650  // previous / neighboring pixel.
651  __device__ float ComputeForwardMessage(const float cost,
652  const float prev) const {
653  return ComputeMessage<true>(cost, prev);
654  }
655 
656  // Compute backward message from current cost and backward message of
657  // previous / neighboring pixel.
658  __device__ float ComputeBackwardMessage(const float cost,
659  const float prev) const {
660  return ComputeMessage<false>(cost, prev);
661  }
662 
663  // Compute the selection probability from the forward and backward message.
664  __device__ inline float ComputeSelProb(const float alpha,
665  const float beta,
666  const float prev,
667  const float prev_weight) const {
668  const float zn0 = (1.0f - alpha) * (1.0f - beta);
669  const float zn1 = alpha * beta;
670  const float curr = zn1 / (zn0 + zn1);
671  return prev_weight * prev + (1.0f - prev_weight) * curr;
672  }
673 
674  // Compute NCC probability. Note that cost = 1 - NCC.
675  __device__ inline float ComputeNCCProb(const float cost) const {
676  return exp(cost * cost * inv_ncc_sigma_square_) * ncc_norm_factor_;
677  }
678 
679  // Compute the triangulation angle probability.
680  __device__ inline float ComputeTriProb(
681  const float cos_triangulation_angle) const {
682  const float abs_cos_triangulation_angle = abs(cos_triangulation_angle);
683  if (abs_cos_triangulation_angle > cos_min_triangulation_angle_) {
684  const float scaled =
685  1.0f - (1.0f - abs_cos_triangulation_angle) /
686  (1.0f - cos_min_triangulation_angle_);
687  const float likelihood = 1.0f - scaled * scaled;
688  return min(1.0f, max(0.0f, likelihood));
689  } else {
690  return 1.0f;
691  }
692  }
693 
694  // Compute the incident angle probability.
695  __device__ inline float ComputeIncProb(
696  const float cos_incident_angle) const {
697  const float x = 1.0f - max(0.0f, cos_incident_angle);
698  return exp(x * x * inv_incident_angle_sigma_square_);
699  }
700 
701  // Compute the warping/resolution prior probability.
702  template <int kWindowSize>
703  __device__ inline float ComputeResolutionProb(const float H[9],
704  const float row,
705  const float col) const {
706  const int kWindowRadius = kWindowSize / 2;
707 
708  // Warp corners of patch in reference image to source image.
709  float src1[2];
710  const float ref1[2] = {col - kWindowRadius, row - kWindowRadius};
711  Mat33DotVec3Homogeneous(H, ref1, src1);
712  float src2[2];
713  const float ref2[2] = {col - kWindowRadius, row + kWindowRadius};
714  Mat33DotVec3Homogeneous(H, ref2, src2);
715  float src3[2];
716  const float ref3[2] = {col + kWindowRadius, row + kWindowRadius};
717  Mat33DotVec3Homogeneous(H, ref3, src3);
718  float src4[2];
719  const float ref4[2] = {col + kWindowRadius, row - kWindowRadius};
720  Mat33DotVec3Homogeneous(H, ref4, src4);
721 
722  // Compute area of patches in reference and source image.
723  const float ref_area = kWindowSize * kWindowSize;
724  const float src_area =
725  abs(0.5f *
726  (src1[0] * src2[1] - src2[0] * src1[1] - src1[0] * src4[1] +
727  src2[0] * src3[1] - src3[0] * src2[1] + src4[0] * src1[1] +
728  src3[0] * src4[1] - src4[0] * src3[1]));
729 
730  if (ref_area > src_area) {
731  return src_area / ref_area;
732  } else {
733  return ref_area / src_area;
734  }
735  }
736 
737 private:
738  // The normalization for the likelihood function, i.e. the normalization for
739  // the prior on the matching cost.
740  __device__ static inline float ComputeNCCCostNormFactor(
741  const float ncc_sigma) {
742  // A = sqrt(2pi)*sigma/2*erf(sqrt(2)/sigma)
743  // erf(x) = 2/sqrt(pi) * integral from 0 to x of exp(-t^2) dt
744  return 2.0f / (sqrt(2.0f * M_PI) * ncc_sigma *
745  erff(2.0f / (ncc_sigma * 1.414213562f)));
746  }
747 
748  // Compute the forward or backward message.
749  template <bool kForward>
750  __device__ inline float ComputeMessage(const float cost,
751  const float prev) const {
752  constexpr float kUniformProb = 0.5f;
753  constexpr float kNoChangeProb = 0.99999f;
754  const float kChangeProb = 1.0f - kNoChangeProb;
755  const float emission = ComputeNCCProb(cost);
756 
757  float zn0; // Message for selection probability = 0.
758  float zn1; // Message for selection probability = 1.
759  if (kForward) {
760  zn0 = (prev * kChangeProb + (1.0f - prev) * kNoChangeProb) *
761  kUniformProb;
762  zn1 = (prev * kNoChangeProb + (1.0f - prev) * kChangeProb) *
763  emission;
764  } else {
765  zn0 = prev * emission * kChangeProb +
766  (1.0f - prev) * kUniformProb * kNoChangeProb;
767  zn1 = prev * emission * kNoChangeProb +
768  (1.0f - prev) * kUniformProb * kChangeProb;
769  }
770 
771  return zn1 / (zn0 + zn1);
772  }
773 
774  const float cos_min_triangulation_angle_;
775  const float inv_incident_angle_sigma_square_;
776  const float inv_ncc_sigma_square_;
777  const float ncc_norm_factor_;
778 };
779 
780 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
781 __global__ void InitNormalMap(GpuMat<float> normal_map,
782  GpuMat<curandState> rand_state_map) {
783  const int row = blockDim.y * blockIdx.y + threadIdx.y;
784  const int col = blockDim.x * blockIdx.x + threadIdx.x;
785  if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
786  curandState rand_state = rand_state_map.Get(row, col);
787  float normal[3];
788  GenerateRandomNormal(row, col, &rand_state, normal);
789  normal_map.SetSlice(row, col, normal);
790  rand_state_map.Set(row, col, rand_state);
791  }
792 }
793 
794 // Rotate normals by 90deg around z-axis in counter-clockwise direction.
795 __global__ void RotateNormalMap(GpuMat<float> normal_map) {
796  const int row = blockDim.y * blockIdx.y + threadIdx.y;
797  const int col = blockDim.x * blockIdx.x + threadIdx.x;
798  if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
799  float normal[3];
800  normal_map.GetSlice(row, col, normal);
801  float rotated_normal[3];
802  rotated_normal[0] = normal[1];
803  rotated_normal[1] = -normal[0];
804  rotated_normal[2] = normal[2];
805  normal_map.SetSlice(row, col, rotated_normal);
806  }
807 }
808 
809 template <int kWindowSize, int kWindowStep>
810 __global__ void ComputeInitialCost(GpuMat<float> cost_map,
811  const GpuMat<float> depth_map,
812  const GpuMat<float> normal_map,
813  const cudaTextureObject_t ref_image_texture,
814  const GpuMat<float> ref_sum_image,
815  const GpuMat<float> ref_squared_sum_image,
816  const cudaTextureObject_t src_images_texture,
817  const cudaTextureObject_t poses_texture,
818  const float sigma_spatial,
819  const float sigma_color) {
820  const int col = blockDim.x * blockIdx.x + threadIdx.x;
821 
822  typedef PhotoConsistencyCostComputer<kWindowSize, kWindowStep>
823  PhotoConsistencyCostComputerType;
824  PhotoConsistencyCostComputerType pcc_computer(
825  ref_image_texture, src_images_texture, poses_texture, sigma_spatial,
826  sigma_color);
827  pcc_computer.col = col;
828 
829  __shared__ float local_ref_image_data
830  [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
831  pcc_computer.local_ref_image.data = &local_ref_image_data[0];
832 
833  float normal[3] = {0};
834  pcc_computer.normal = normal;
835 
836  for (int row = 0; row < cost_map.GetHeight(); ++row) {
837  // Note that this must be executed even for pixels outside the borders,
838  // since pixels are used in the local neighborhood of the current pixel.
839  pcc_computer.Read(row);
840 
841  if (col < cost_map.GetWidth()) {
842  pcc_computer.depth = depth_map.Get(row, col);
843  normal_map.GetSlice(row, col, normal);
844 
845  pcc_computer.row = row;
846  pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
847  pcc_computer.local_ref_squared_sum =
848  ref_squared_sum_image.Get(row, col);
849 
850  for (int image_idx = 0; image_idx < cost_map.GetDepth();
851  ++image_idx) {
852  pcc_computer.src_image_idx = image_idx;
853  cost_map.Set(row, col, image_idx, pcc_computer.Compute());
854  }
855  }
856  }
857 }
858 
859 struct SweepOptions {
860  float perturbation = 1.0f;
861  float depth_min = 0.0f;
862  float depth_max = 1.0f;
863  int num_samples = 15;
864  float sigma_spatial = 3.0f;
865  float sigma_color = 0.3f;
866  float ncc_sigma = 0.6f;
867  float min_triangulation_angle = 0.5f;
868  float incident_angle_sigma = 0.9f;
869  float prev_sel_prob_weight = 0.0f;
870  float geom_consistency_regularizer = 0.1f;
871  float geom_consistency_max_cost = 5.0f;
872  float filter_min_ncc = 0.1f;
873  float filter_min_triangulation_angle = 3.0f;
874  int filter_min_num_consistent = 2;
875  float filter_geom_consistency_max_cost = 1.0f;
876 };
877 
878 template <int kWindowSize,
879  int kWindowStep,
880  bool kGeomConsistencyTerm = false,
881  bool kFilterPhotoConsistency = false,
882  bool kFilterGeomConsistency = false>
883 __global__ void SweepFromTopToBottom(
884  GpuMat<float> global_workspace,
885  GpuMat<curandState> rand_state_map,
886  GpuMat<float> cost_map,
887  GpuMat<float> depth_map,
888  GpuMat<float> normal_map,
889  GpuMat<uint8_t> consistency_mask,
890  GpuMat<float> sel_prob_map,
891  const GpuMat<float> prev_sel_prob_map,
892  const cudaTextureObject_t ref_image_texture,
893  const GpuMat<float> ref_sum_image,
894  const GpuMat<float> ref_squared_sum_image,
895  const cudaTextureObject_t src_images_texture,
896  const cudaTextureObject_t src_depth_maps_texture,
897  const cudaTextureObject_t poses_texture,
898  const SweepOptions options) {
899  const int col = blockDim.x * blockIdx.x + threadIdx.x;
900 
901  // Probability for boundary pixels.
902  constexpr float kUniformProb = 0.5f;
903 
904  LikelihoodComputer likelihood_computer(options.ncc_sigma,
905  options.min_triangulation_angle,
906  options.incident_angle_sigma);
907 
908  float* forward_message =
909  &global_workspace.GetPtr()[col * global_workspace.GetHeight()];
910  float* sampling_probs =
911  &global_workspace.GetPtr()[global_workspace.GetWidth() *
912  global_workspace.GetHeight() +
913  col * global_workspace.GetHeight()];
914 
915  //////////////////////////////////////////////////////////////////////////////
916  // Compute backward message for all rows. Note that the backward messages
917  // are temporarily stored in the sel_prob_map and replaced row by row as the
918  // updated forward messages are computed further below.
919  //////////////////////////////////////////////////////////////////////////////
920 
921  if (col < cost_map.GetWidth()) {
922  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
923  // Compute backward message.
924  float beta = kUniformProb;
925  for (int row = cost_map.GetHeight() - 1; row >= 0; --row) {
926  const float cost = cost_map.Get(row, col, image_idx);
927  beta = likelihood_computer.ComputeBackwardMessage(cost, beta);
928  sel_prob_map.Set(row, col, image_idx, beta);
929  }
930 
931  // Initialize forward message.
932  forward_message[image_idx] = kUniformProb;
933  }
934  }
935 
936  //////////////////////////////////////////////////////////////////////////////
937  // Estimate parameters for remaining rows and compute selection
938  // probabilities.
939  //////////////////////////////////////////////////////////////////////////////
940 
941  typedef PhotoConsistencyCostComputer<kWindowSize, kWindowStep>
942  PhotoConsistencyCostComputerType;
943  PhotoConsistencyCostComputerType pcc_computer(
944  ref_image_texture, src_images_texture, poses_texture,
945  options.sigma_spatial, options.sigma_color);
946  pcc_computer.col = col;
947 
948  __shared__ float local_ref_image_data
949  [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
950  pcc_computer.local_ref_image.data = &local_ref_image_data[0];
951 
952  struct ParamState {
953  float depth = 0.0f;
954  float normal[3] = {0};
955  };
956 
957  // Parameters of previous pixel in column.
958  ParamState prev_param_state;
959  // Parameters of current pixel in column.
960  ParamState curr_param_state;
961  // Randomly sampled parameters.
962  ParamState rand_param_state;
963  // Cuda PRNG state for random sampling.
964  curandState rand_state;
965 
966  if (col < cost_map.GetWidth()) {
967  // Read random state for current column.
968  rand_state = rand_state_map.Get(0, col);
969  // Parameters for first row in column.
970  prev_param_state.depth = depth_map.Get(0, col);
971  normal_map.GetSlice(0, col, prev_param_state.normal);
972  }
973 
974  for (int row = 0; row < cost_map.GetHeight(); ++row) {
975  // Note that this must be executed even for pixels outside the borders,
976  // since pixels are used in the local neighborhood of the current pixel.
977  pcc_computer.Read(row);
978 
979  if (col >= cost_map.GetWidth()) {
980  continue;
981  }
982 
983  pcc_computer.row = row;
984  pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
985  pcc_computer.local_ref_squared_sum =
986  ref_squared_sum_image.Get(row, col);
987 
988  // Propagate the depth at which the current ray intersects with the
989  // plane of the normal of the previous ray. This helps to better
990  // estimate the depth of very oblique structures, i.e. pixels whose
991  // normal direction is significantly different from their viewing
992  // direction.
993  prev_param_state.depth = PropagateDepth(
994  prev_param_state.depth, prev_param_state.normal, row - 1, row);
995 
996  // Read parameters for current pixel from previous sweep.
997  curr_param_state.depth = depth_map.Get(row, col);
998  normal_map.GetSlice(row, col, curr_param_state.normal);
999 
1000  // Generate random parameters.
1001  rand_param_state.depth = PerturbDepth(
1002  options.perturbation, curr_param_state.depth, &rand_state);
1003  PerturbNormal(row, col, options.perturbation * M_PI,
1004  curr_param_state.normal, &rand_state,
1005  rand_param_state.normal);
1006 
1007  // Read in the backward message, compute selection probabilities and
1008  // modulate selection probabilities with priors.
1009 
1010  float point[3];
1011  ComputePointAtDepth(row, col, curr_param_state.depth, point);
1012 
1013  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1014  const float cost = cost_map.Get(row, col, image_idx);
1015  const float alpha = likelihood_computer.ComputeForwardMessage(
1016  cost, forward_message[image_idx]);
1017  const float beta = sel_prob_map.Get(row, col, image_idx);
1018  const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
1019  const float sel_prob = likelihood_computer.ComputeSelProb(
1020  alpha, beta, prev_prob, options.prev_sel_prob_weight);
1021 
1022  float cos_triangulation_angle;
1023  float cos_incident_angle;
1024  ComputeViewingAngles(poses_texture, point, curr_param_state.normal,
1025  image_idx, &cos_triangulation_angle,
1026  &cos_incident_angle);
1027  const float tri_prob =
1028  likelihood_computer.ComputeTriProb(cos_triangulation_angle);
1029  const float inc_prob =
1030  likelihood_computer.ComputeIncProb(cos_incident_angle);
1031 
1032  float H[9];
1033  ComposeHomography(poses_texture, image_idx, row, col,
1034  curr_param_state.depth, curr_param_state.normal,
1035  H);
1036  const float res_prob =
1037  likelihood_computer.ComputeResolutionProb<kWindowSize>(
1038  H, row, col);
1039 
1040  sampling_probs[image_idx] =
1041  sel_prob * tri_prob * inc_prob * res_prob;
1042  }
1043 
1044  TransformPDFToCDF(sampling_probs, cost_map.GetDepth());
1045 
1046  // Compute matching cost using Monte Carlo sampling of source images.
1047  // Images with higher selection probability are more likely to be
1048  // sampled. Hence, if only very few source images see the reference
1049  // image pixel, the same source image is likely to be sampled many
1050  // times. Instead of taking the best K probabilities, this sampling
1051  // scheme has the advantage of being adaptive to any distribution of
1052  // selection probabilities.
1053 
1054  constexpr int kNumCosts = 5;
1055  float costs[kNumCosts] = {0};
1056  const float depths[kNumCosts] = {
1057  curr_param_state.depth, prev_param_state.depth,
1058  rand_param_state.depth, curr_param_state.depth,
1059  rand_param_state.depth};
1060  const float* normals[kNumCosts] = {
1061  curr_param_state.normal, prev_param_state.normal,
1062  rand_param_state.normal, rand_param_state.normal,
1063  curr_param_state.normal};
1064 
1065  for (int sample = 0; sample < options.num_samples; ++sample) {
1066  const float rand_prob = curand_uniform(&rand_state) - FLT_EPSILON;
1067 
1068  pcc_computer.src_image_idx = -1;
1069  for (int image_idx = 0; image_idx < cost_map.GetDepth();
1070  ++image_idx) {
1071  const float prob = sampling_probs[image_idx];
1072  if (prob > rand_prob) {
1073  pcc_computer.src_image_idx = image_idx;
1074  break;
1075  }
1076  }
1077 
1078  if (pcc_computer.src_image_idx == -1) {
1079  continue;
1080  }
1081 
1082  costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx);
1083  if (kGeomConsistencyTerm) {
1084  costs[0] += options.geom_consistency_regularizer *
1085  ComputeGeomConsistencyCost(
1086  poses_texture, src_depth_maps_texture, row,
1087  col, depths[0], pcc_computer.src_image_idx,
1088  options.geom_consistency_max_cost);
1089  }
1090 
1091  for (int i = 1; i < kNumCosts; ++i) {
1092  pcc_computer.depth = depths[i];
1093  pcc_computer.normal = normals[i];
1094  costs[i] += pcc_computer.Compute();
1095  if (kGeomConsistencyTerm) {
1096  costs[i] +=
1097  options.geom_consistency_regularizer *
1098  ComputeGeomConsistencyCost(
1099  poses_texture, src_depth_maps_texture, row,
1100  col, depths[i], pcc_computer.src_image_idx,
1101  options.geom_consistency_max_cost);
1102  }
1103  }
1104  }
1105 
1106  // Find the parameters of the minimum cost.
1107  const int min_cost_idx = FindMinCost<kNumCosts>(costs);
1108  const float best_depth = depths[min_cost_idx];
1109  const float* best_normal = normals[min_cost_idx];
1110 
1111  // Save best new parameters.
1112  depth_map.Set(row, col, best_depth);
1113  normal_map.SetSlice(row, col, best_normal);
1114 
1115  // Use the new cost to recompute the updated forward message and
1116  // the selection probability.
1117  pcc_computer.depth = best_depth;
1118  pcc_computer.normal = best_normal;
1119  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
1120  // Determine the cost for best depth.
1121  float cost;
1122  if (min_cost_idx == 0) {
1123  cost = cost_map.Get(row, col, image_idx);
1124  } else {
1125  pcc_computer.src_image_idx = image_idx;
1126  cost = pcc_computer.Compute();
1127  cost_map.Set(row, col, image_idx, cost);
1128  }
1129 
1130  const float alpha = likelihood_computer.ComputeForwardMessage(
1131  cost, forward_message[image_idx]);
1132  const float beta = sel_prob_map.Get(row, col, image_idx);
1133  const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
1134  const float prob = likelihood_computer.ComputeSelProb(
1135  alpha, beta, prev_prob, options.prev_sel_prob_weight);
1136  forward_message[image_idx] = alpha;
1137  sel_prob_map.Set(row, col, image_idx, prob);
1138  }
1139 
1140  if (kFilterPhotoConsistency || kFilterGeomConsistency) {
1141  int num_consistent = 0;
1142 
1143  float best_point[3];
1144  ComputePointAtDepth(row, col, best_depth, best_point);
1145 
1146  const float min_ncc_prob = likelihood_computer.ComputeNCCProb(
1147  1.0f - options.filter_min_ncc);
1148  const float cos_min_triangulation_angle =
1149  cos(options.filter_min_triangulation_angle);
1150 
1151  for (int image_idx = 0; image_idx < cost_map.GetDepth();
1152  ++image_idx) {
1153  float cos_triangulation_angle;
1154  float cos_incident_angle;
1155  ComputeViewingAngles(poses_texture, best_point, best_normal,
1156  image_idx, &cos_triangulation_angle,
1157  &cos_incident_angle);
1158  if (cos_triangulation_angle > cos_min_triangulation_angle ||
1159  cos_incident_angle <= 0.0f) {
1160  continue;
1161  }
1162 
1163  if (!kFilterGeomConsistency) {
1164  if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob) {
1165  consistency_mask.Set(row, col, image_idx, 1);
1166  num_consistent += 1;
1167  }
1168  } else if (!kFilterPhotoConsistency) {
1169  if (ComputeGeomConsistencyCost(
1170  poses_texture, src_depth_maps_texture, row, col,
1171  best_depth, image_idx,
1172  options.geom_consistency_max_cost) <=
1173  options.filter_geom_consistency_max_cost) {
1174  consistency_mask.Set(row, col, image_idx, 1);
1175  num_consistent += 1;
1176  }
1177  } else {
1178  if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob &&
1179  ComputeGeomConsistencyCost(
1180  poses_texture, src_depth_maps_texture, row, col,
1181  best_depth, image_idx,
1182  options.geom_consistency_max_cost) <=
1183  options.filter_geom_consistency_max_cost) {
1184  consistency_mask.Set(row, col, image_idx, 1);
1185  num_consistent += 1;
1186  }
1187  }
1188  }
1189 
1190  if (num_consistent < options.filter_min_num_consistent) {
1191  depth_map.Set(row, col, 0.0f);
1192  normal_map.Set(row, col, 0, 0.0f);
1193  normal_map.Set(row, col, 1, 0.0f);
1194  normal_map.Set(row, col, 2, 0.0f);
1195  for (int image_idx = 0; image_idx < cost_map.GetDepth();
1196  ++image_idx) {
1197  consistency_mask.Set(row, col, image_idx, 0);
1198  }
1199  }
1200  }
1201 
1202  // Update previous depth for next row.
1203  prev_param_state.depth = best_depth;
1204  for (int i = 0; i < 3; ++i) {
1205  prev_param_state.normal[i] = best_normal[i];
1206  }
1207  }
1208 
1209  if (col < cost_map.GetWidth()) {
1210  rand_state_map.Set(0, col, rand_state);
1211  }
1212 }
1213 
1214 PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options,
1215  const PatchMatch::Problem& problem)
1216  : options_(options),
1217  problem_(problem),
1218  ref_width_(0),
1219  ref_height_(0),
1220  rotation_in_half_pi_(0) {
1221  SetBestCudaDevice(std::stoi(options_.gpu_index));
1222  InitRefImage();
1223  InitSourceImages();
1224  InitTransforms();
1225  InitWorkspaceMemory();
1226 }
1227 
1228 void PatchMatchCuda::Run() {
1229 #define CASE_WINDOW_RADIUS(window_radius, window_step) \
1230  case window_radius: \
1231  RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \
1232  break;
1233 
1234 #define CASE_WINDOW_STEP(window_step) \
1235  case window_step: \
1236  switch (options_.window_radius) { \
1237  CASE_WINDOW_RADIUS(1, window_step) \
1238  CASE_WINDOW_RADIUS(2, window_step) \
1239  CASE_WINDOW_RADIUS(3, window_step) \
1240  CASE_WINDOW_RADIUS(4, window_step) \
1241  CASE_WINDOW_RADIUS(5, window_step) \
1242  CASE_WINDOW_RADIUS(6, window_step) \
1243  CASE_WINDOW_RADIUS(7, window_step) \
1244  CASE_WINDOW_RADIUS(8, window_step) \
1245  CASE_WINDOW_RADIUS(9, window_step) \
1246  CASE_WINDOW_RADIUS(10, window_step) \
1247  CASE_WINDOW_RADIUS(11, window_step) \
1248  CASE_WINDOW_RADIUS(12, window_step) \
1249  CASE_WINDOW_RADIUS(13, window_step) \
1250  CASE_WINDOW_RADIUS(14, window_step) \
1251  CASE_WINDOW_RADIUS(15, window_step) \
1252  CASE_WINDOW_RADIUS(16, window_step) \
1253  CASE_WINDOW_RADIUS(17, window_step) \
1254  CASE_WINDOW_RADIUS(18, window_step) \
1255  CASE_WINDOW_RADIUS(19, window_step) \
1256  CASE_WINDOW_RADIUS(20, window_step) \
1257  default: { \
1258  LOG(ERROR) << "Window size " << options_.window_radius \
1259  << " not supported"; \
1260  break; \
1261  } \
1262  } \
1263  break;
1264 
1265  switch (options_.window_step) {
1266  CASE_WINDOW_STEP(1)
1267  CASE_WINDOW_STEP(2)
1268  default: {
1269  LOG(ERROR) << "Window step " << options_.window_step
1270  << " not supported";
1271  break;
1272  }
1273  }
1274 
1275 #undef SWITCH_WINDOW_RADIUS
1276 #undef CALL_RUN_FUNC
1277 }
1278 
1279 DepthMap PatchMatchCuda::GetDepthMap() const {
1280  return DepthMap(depth_map_->CopyToMat(), options_.depth_min,
1281  options_.depth_max);
1282 }
1283 
1284 NormalMap PatchMatchCuda::GetNormalMap() const {
1285  return NormalMap(normal_map_->CopyToMat());
1286 }
1287 
1288 Mat<float> PatchMatchCuda::GetSelProbMap() const {
1289  return prev_sel_prob_map_->CopyToMat();
1290 }
1291 
1292 std::vector<int> PatchMatchCuda::GetConsistentImageIdxs() const {
1293  const Mat<uint8_t> mask = consistency_mask_->CopyToMat();
1294  std::vector<int> consistent_image_idxs;
1295  std::vector<int> pixel_consistent_image_idxs;
1296  pixel_consistent_image_idxs.reserve(mask.GetDepth());
1297  for (size_t r = 0; r < mask.GetHeight(); ++r) {
1298  for (size_t c = 0; c < mask.GetWidth(); ++c) {
1299  pixel_consistent_image_idxs.clear();
1300  for (size_t d = 0; d < mask.GetDepth(); ++d) {
1301  if (mask.Get(r, c, d)) {
1302  pixel_consistent_image_idxs.push_back(
1303  problem_.src_image_idxs[d]);
1304  }
1305  }
1306  if (pixel_consistent_image_idxs.size() > 0) {
1307  consistent_image_idxs.push_back(c);
1308  consistent_image_idxs.push_back(r);
1309  consistent_image_idxs.push_back(
1310  pixel_consistent_image_idxs.size());
1311  consistent_image_idxs.insert(
1312  consistent_image_idxs.end(),
1313  pixel_consistent_image_idxs.begin(),
1314  pixel_consistent_image_idxs.end());
1315  }
1316  }
1317  }
1318  return consistent_image_idxs;
1319 }
1320 
1321 template <int kWindowSize, int kWindowStep>
1322 void PatchMatchCuda::RunWithWindowSizeAndStep() {
1323  // Wait for all initializations to finish.
1324  CUDA_SYNC_AND_CHECK();
1325 
1326  CudaTimer total_timer;
1327  CudaTimer init_timer;
1328 
1329  ComputeCudaConfig();
1330  ComputeInitialCost<kWindowSize, kWindowStep>
1331  <<<sweep_grid_size_, sweep_block_size_>>>(
1332  *cost_map_, *depth_map_, *normal_map_,
1333  ref_image_texture_->GetObj(), *ref_image_->sum_image,
1334  *ref_image_->squared_sum_image,
1335  src_images_texture_->GetObj(), poses_texture_[0]->GetObj(),
1336  options_.sigma_spatial, options_.sigma_color);
1337  CUDA_SYNC_AND_CHECK();
1338 
1339  init_timer.Print("Initialization");
1340 
1341  const float total_num_steps = options_.num_iterations * 4;
1342 
1343  SweepOptions sweep_options;
1344  sweep_options.depth_min = options_.depth_min;
1345  sweep_options.depth_max = options_.depth_max;
1346  sweep_options.sigma_spatial = options_.sigma_spatial;
1347  sweep_options.sigma_color = options_.sigma_color;
1348  sweep_options.num_samples = options_.num_samples;
1349  sweep_options.ncc_sigma = options_.ncc_sigma;
1350  sweep_options.min_triangulation_angle =
1351  DEG2RAD(options_.min_triangulation_angle);
1352  sweep_options.incident_angle_sigma = options_.incident_angle_sigma;
1353  sweep_options.geom_consistency_regularizer =
1354  options_.geom_consistency_regularizer;
1355  sweep_options.geom_consistency_max_cost =
1356  options_.geom_consistency_max_cost;
1357  sweep_options.filter_min_ncc = options_.filter_min_ncc;
1358  sweep_options.filter_min_triangulation_angle =
1359  DEG2RAD(options_.filter_min_triangulation_angle);
1360  sweep_options.filter_min_num_consistent =
1361  options_.filter_min_num_consistent;
1362  sweep_options.filter_geom_consistency_max_cost =
1363  options_.filter_geom_consistency_max_cost;
1364 
1365  for (int iter = 0; iter < options_.num_iterations; ++iter) {
1366  CudaTimer iter_timer;
1367 
1368  for (int sweep = 0; sweep < 4; ++sweep) {
1369  CudaTimer sweep_timer;
1370 
1371  // Expenentially reduce amount of perturbation during the
1372  // optimization.
1373  sweep_options.perturbation =
1374  1.0f / std::pow(2.0f, iter + sweep / 4.0f);
1375 
1376  // Linearly increase the influence of previous selection
1377  // probabilities.
1378  sweep_options.prev_sel_prob_weight =
1379  static_cast<float>(iter * 4 + sweep) / total_num_steps;
1380 
1381  const bool last_sweep =
1382  iter == options_.num_iterations - 1 && sweep == 3;
1383 
1384 #define CALL_SWEEP_FUNC \
1385  SweepFromTopToBottom<kWindowSize, kWindowStep, kGeomConsistencyTerm, \
1386  kFilterPhotoConsistency, kFilterGeomConsistency> \
1387  <<<sweep_grid_size_, sweep_block_size_>>>( \
1388  *global_workspace_, *rand_state_map_, *cost_map_, \
1389  *depth_map_, *normal_map_, *consistency_mask_, \
1390  *sel_prob_map_, *prev_sel_prob_map_, \
1391  ref_image_texture_->GetObj(), *ref_image_->sum_image, \
1392  *ref_image_->squared_sum_image, \
1393  src_images_texture_->GetObj(), \
1394  src_depth_maps_texture_ == nullptr \
1395  ? 0 \
1396  : src_depth_maps_texture_->GetObj(), \
1397  poses_texture_[rotation_in_half_pi_]->GetObj(), \
1398  sweep_options);
1399 
1400  if (last_sweep) {
1401  if (options_.filter) {
1402  consistency_mask_.reset(new GpuMat<uint8_t>(
1403  cost_map_->GetWidth(), cost_map_->GetHeight(),
1404  cost_map_->GetDepth()));
1405  consistency_mask_->FillWithScalar(0);
1406  }
1407  if (options_.geom_consistency) {
1408  const bool kGeomConsistencyTerm = true;
1409  if (options_.filter) {
1410  const bool kFilterPhotoConsistency = true;
1411  const bool kFilterGeomConsistency = true;
1412  CALL_SWEEP_FUNC
1413  } else {
1414  const bool kFilterPhotoConsistency = false;
1415  const bool kFilterGeomConsistency = false;
1416  CALL_SWEEP_FUNC
1417  }
1418  } else {
1419  const bool kGeomConsistencyTerm = false;
1420  if (options_.filter) {
1421  const bool kFilterPhotoConsistency = true;
1422  const bool kFilterGeomConsistency = false;
1423  CALL_SWEEP_FUNC
1424  } else {
1425  const bool kFilterPhotoConsistency = false;
1426  const bool kFilterGeomConsistency = false;
1427  CALL_SWEEP_FUNC
1428  }
1429  }
1430  } else {
1431  const bool kFilterPhotoConsistency = false;
1432  const bool kFilterGeomConsistency = false;
1433  if (options_.geom_consistency) {
1434  const bool kGeomConsistencyTerm = true;
1435  CALL_SWEEP_FUNC
1436  } else {
1437  const bool kGeomConsistencyTerm = false;
1438  CALL_SWEEP_FUNC
1439  }
1440  }
1441 
1442 #undef CALL_SWEEP_FUNC
1443 
1444  CUDA_SYNC_AND_CHECK();
1445 
1446  Rotate();
1447 
1448  // Rotate selected image map.
1449  if (last_sweep && options_.filter) {
1450  std::unique_ptr<GpuMat<uint8_t>> rot_consistency_mask_(
1451  new GpuMat<uint8_t>(cost_map_->GetWidth(),
1452  cost_map_->GetHeight(),
1453  cost_map_->GetDepth()));
1454  consistency_mask_->Rotate(rot_consistency_mask_.get());
1455  consistency_mask_.swap(rot_consistency_mask_);
1456  }
1457 
1458  sweep_timer.Print(" Sweep " + std::to_string(sweep + 1));
1459  }
1460 
1461  iter_timer.Print("Iteration " + std::to_string(iter + 1));
1462  }
1463 
1464  total_timer.Print("Total");
1465 }
1466 
1467 void PatchMatchCuda::ComputeCudaConfig() {
1468  sweep_block_size_.x = THREADS_PER_BLOCK;
1469  sweep_block_size_.y = 1;
1470  sweep_block_size_.z = 1;
1471  sweep_grid_size_.x = (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1472  sweep_grid_size_.y = 1;
1473  sweep_grid_size_.z = 1;
1474 
1475  elem_wise_block_size_.x = THREADS_PER_BLOCK;
1476  elem_wise_block_size_.y = THREADS_PER_BLOCK;
1477  elem_wise_block_size_.z = 1;
1478  elem_wise_grid_size_.x =
1479  (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
1480  elem_wise_grid_size_.y =
1481  (depth_map_->GetHeight() - 1) / THREADS_PER_BLOCK + 1;
1482  elem_wise_grid_size_.z = 1;
1483 }
1484 
1485 void PatchMatchCuda::BindRefImageTexture() {
1486  cudaTextureDesc texture_desc;
1487  memset(&texture_desc, 0, sizeof(texture_desc));
1488  texture_desc.addressMode[0] = cudaAddressModeBorder;
1489  texture_desc.addressMode[1] = cudaAddressModeBorder;
1490  texture_desc.addressMode[2] = cudaAddressModeBorder;
1491  texture_desc.filterMode = cudaFilterModePoint;
1492  texture_desc.readMode = cudaReadModeNormalizedFloat;
1493  texture_desc.normalizedCoords = false;
1494  ref_image_texture_ = CudaArrayLayeredTexture<uint8_t>::FromGpuMat(
1495  texture_desc, *ref_image_->image);
1496 }
1497 
1498 void PatchMatchCuda::InitRefImage() {
1499  const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1500 
1501  ref_width_ = ref_image.GetWidth();
1502  ref_height_ = ref_image.GetHeight();
1503 
1504  // Upload to device and filter.
1505  ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_));
1506  const std::vector<uint8_t> ref_image_array =
1507  ref_image.GetBitmap().ConvertToRowMajorArray();
1508  ref_image_->Filter(ref_image_array.data(), options_.window_radius,
1509  options_.window_step, options_.sigma_spatial,
1510  options_.sigma_color);
1511 
1512  BindRefImageTexture();
1513 }
1514 
1515 void PatchMatchCuda::InitSourceImages() {
1516  // Determine maximum image size.
1517  size_t max_width = 0;
1518  size_t max_height = 0;
1519  for (const auto image_idx : problem_.src_image_idxs) {
1520  const Image& image = problem_.images->at(image_idx);
1521  if (image.GetWidth() > max_width) {
1522  max_width = image.GetWidth();
1523  }
1524  if (image.GetHeight() > max_height) {
1525  max_height = image.GetHeight();
1526  }
1527  }
1528 
1529  // Upload source images to device.
1530  {
1531  // Copy source images to contiguous memory block.
1532  const uint8_t kDefaultValue = 0;
1533  std::vector<uint8_t> src_images_host_data(
1534  static_cast<size_t>(max_width * max_height *
1535  problem_.src_image_idxs.size()),
1536  kDefaultValue);
1537  for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1538  const Image& image =
1539  problem_.images->at(problem_.src_image_idxs[i]);
1540  const Bitmap& bitmap = image.GetBitmap();
1541  uint8_t* dest =
1542  src_images_host_data.data() + max_width * max_height * i;
1543  for (size_t r = 0; r < image.GetHeight(); ++r) {
1544  memcpy(dest, bitmap.GetScanline(r),
1545  image.GetWidth() * sizeof(uint8_t));
1546  dest += max_width;
1547  }
1548  }
1549 
1550  // Create source images texture.
1551  cudaTextureDesc texture_desc;
1552  memset(&texture_desc, 0, sizeof(texture_desc));
1553  texture_desc.addressMode[0] = cudaAddressModeBorder;
1554  texture_desc.addressMode[1] = cudaAddressModeBorder;
1555  texture_desc.addressMode[2] = cudaAddressModeBorder;
1556  texture_desc.filterMode = cudaFilterModeLinear;
1557  texture_desc.readMode = cudaReadModeNormalizedFloat;
1558  texture_desc.normalizedCoords = false;
1559  src_images_texture_ = CudaArrayLayeredTexture<uint8_t>::FromHostArray(
1560  texture_desc, max_width, max_height,
1561  problem_.src_image_idxs.size(), src_images_host_data.data());
1562  }
1563 
1564  // Upload source depth maps to device.
1565  if (options_.geom_consistency) {
1566  const float kDefaultValue = 0.0f;
1567  std::vector<float> src_depth_maps_host_data(
1568  static_cast<size_t>(max_width * max_height *
1569  problem_.src_image_idxs.size()),
1570  kDefaultValue);
1571  for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
1572  const DepthMap& depth_map =
1573  problem_.depth_maps->at(problem_.src_image_idxs[i]);
1574  float* dest = src_depth_maps_host_data.data() +
1575  max_width * max_height * i;
1576  for (size_t r = 0; r < depth_map.GetHeight(); ++r) {
1577  memcpy(dest, depth_map.GetPtr() + r * depth_map.GetWidth(),
1578  depth_map.GetWidth() * sizeof(float));
1579  dest += max_width;
1580  }
1581  }
1582 
1583  // Create source depth maps texture.
1584  cudaTextureDesc texture_desc;
1585  memset(&texture_desc, 0, sizeof(texture_desc));
1586  texture_desc.addressMode[0] = cudaAddressModeBorder;
1587  texture_desc.addressMode[1] = cudaAddressModeBorder;
1588  texture_desc.addressMode[2] = cudaAddressModeBorder;
1589  texture_desc.filterMode = cudaFilterModePoint;
1590  texture_desc.readMode = cudaReadModeElementType;
1591  texture_desc.normalizedCoords = false;
1592  src_depth_maps_texture_ = CudaArrayLayeredTexture<float>::FromHostArray(
1593  texture_desc, max_width, max_height,
1594  problem_.src_image_idxs.size(),
1595  src_depth_maps_host_data.data());
1596  }
1597 }
1598 
1599 void PatchMatchCuda::InitTransforms() {
1600  const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
1601 
1602  //////////////////////////////////////////////////////////////////////////////
1603  // Generate rotated versions (counter-clockwise) of calibration matrix.
1604  //////////////////////////////////////////////////////////////////////////////
1605 
1606  for (size_t i = 0; i < 4; ++i) {
1607  ref_K_host_[i][0] = ref_image.GetK()[0];
1608  ref_K_host_[i][1] = ref_image.GetK()[2];
1609  ref_K_host_[i][2] = ref_image.GetK()[4];
1610  ref_K_host_[i][3] = ref_image.GetK()[5];
1611  }
1612 
1613  // Rotated by 90 degrees.
1614  std::swap(ref_K_host_[1][0], ref_K_host_[1][2]);
1615  std::swap(ref_K_host_[1][1], ref_K_host_[1][3]);
1616  ref_K_host_[1][3] = ref_width_ - 1 - ref_K_host_[1][3];
1617 
1618  // Rotated by 180 degrees.
1619  ref_K_host_[2][1] = ref_width_ - 1 - ref_K_host_[2][1];
1620  ref_K_host_[2][3] = ref_height_ - 1 - ref_K_host_[2][3];
1621 
1622  // Rotated by 270 degrees.
1623  std::swap(ref_K_host_[3][0], ref_K_host_[3][2]);
1624  std::swap(ref_K_host_[3][1], ref_K_host_[3][3]);
1625  ref_K_host_[3][1] = ref_height_ - 1 - ref_K_host_[3][1];
1626 
1627  // Extract 1/fx, -cx/fx, fy, -cy/fy.
1628  for (size_t i = 0; i < 4; ++i) {
1629  ref_inv_K_host_[i][0] = 1.0f / ref_K_host_[i][0];
1630  ref_inv_K_host_[i][1] = -ref_K_host_[i][1] / ref_K_host_[i][0];
1631  ref_inv_K_host_[i][2] = 1.0f / ref_K_host_[i][2];
1632  ref_inv_K_host_[i][3] = -ref_K_host_[i][3] / ref_K_host_[i][2];
1633  }
1634 
1635  // Bind 0 degrees version to constant global memory.
1636  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[0], sizeof(float) * 4,
1637  0, cudaMemcpyHostToDevice));
1638  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[0],
1639  sizeof(float) * 4, 0,
1640  cudaMemcpyHostToDevice));
1641 
1642  //////////////////////////////////////////////////////////////////////////////
1643  // Generate rotated versions of camera poses.
1644  //////////////////////////////////////////////////////////////////////////////
1645 
1646  float rotated_R[9];
1647  memcpy(rotated_R, ref_image.GetR(), 9 * sizeof(float));
1648 
1649  float rotated_T[3];
1650  memcpy(rotated_T, ref_image.GetT(), 3 * sizeof(float));
1651 
1652  // Matrix for 90deg rotation around Z-axis in counter-clockwise direction.
1653  const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1};
1654 
1655  cudaTextureDesc texture_desc;
1656  memset(&texture_desc, 0, sizeof(texture_desc));
1657  texture_desc.addressMode[0] = cudaAddressModeBorder;
1658  texture_desc.addressMode[1] = cudaAddressModeBorder;
1659  texture_desc.addressMode[2] = cudaAddressModeBorder;
1660  texture_desc.filterMode = cudaFilterModePoint;
1661  texture_desc.readMode = cudaReadModeElementType;
1662  texture_desc.normalizedCoords = false;
1663 
1664  for (size_t i = 0; i < 4; ++i) {
1665  const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12;
1666  std::vector<float> poses_host_data(kNumTformParams *
1667  problem_.src_image_idxs.size());
1668  int offset = 0;
1669  for (const auto image_idx : problem_.src_image_idxs) {
1670  const Image& image = problem_.images->at(image_idx);
1671 
1672  const float K[4] = {image.GetK()[0], image.GetK()[2],
1673  image.GetK()[4], image.GetK()[5]};
1674  memcpy(poses_host_data.data() + offset, K, 4 * sizeof(float));
1675  offset += 4;
1676 
1677  float rel_R[9];
1678  float rel_T[3];
1679  ComputeRelativePose(rotated_R, rotated_T, image.GetR(),
1680  image.GetT(), rel_R, rel_T);
1681  memcpy(poses_host_data.data() + offset, rel_R, 9 * sizeof(float));
1682  offset += 9;
1683  memcpy(poses_host_data.data() + offset, rel_T, 3 * sizeof(float));
1684  offset += 3;
1685 
1686  float C[3];
1687  ComputeProjectionCenter(rel_R, rel_T, C);
1688  memcpy(poses_host_data.data() + offset, C, 3 * sizeof(float));
1689  offset += 3;
1690 
1691  float P[12];
1692  ComposeProjectionMatrix(image.GetK(), rel_R, rel_T, P);
1693  memcpy(poses_host_data.data() + offset, P, 12 * sizeof(float));
1694  offset += 12;
1695 
1696  float inv_P[12];
1697  ComposeInverseProjectionMatrix(image.GetK(), rel_R, rel_T, inv_P);
1698  memcpy(poses_host_data.data() + offset, inv_P, 12 * sizeof(float));
1699  offset += 12;
1700  }
1701 
1702  poses_texture_[i] = CudaArrayLayeredTexture<float>::FromHostArray(
1703  texture_desc, kNumTformParams, problem_.src_image_idxs.size(),
1704  1, poses_host_data.data());
1705 
1706  RotatePose(R_z90, rotated_R, rotated_T);
1707  }
1708 }
1709 
1710 void PatchMatchCuda::InitWorkspaceMemory() {
1711  rand_state_map_.reset(new GpuMatPRNG(ref_width_, ref_height_));
1712 
1713  depth_map_.reset(new GpuMat<float>(ref_width_, ref_height_));
1714  if (options_.geom_consistency) {
1715  const DepthMap& init_depth_map =
1716  problem_.depth_maps->at(problem_.ref_image_idx);
1717  depth_map_->CopyToDevice(init_depth_map.GetPtr(),
1718  init_depth_map.GetWidth() * sizeof(float));
1719  } else {
1720  depth_map_->FillWithRandomNumbers(options_.depth_min,
1721  options_.depth_max, *rand_state_map_);
1722  }
1723 
1724  normal_map_.reset(new GpuMat<float>(ref_width_, ref_height_, 3));
1725 
1726  // Note that it is not necessary to keep the selection probability map in
1727  // memory for all pixels. Theoretically, it is possible to incorporate
1728  // the temporary selection probabilities in the global_workspace_.
1729  // However, it is useful to keep the probabilities for the entire image
1730  // in memory, so that it can be exported.
1731  sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1732  problem_.src_image_idxs.size()));
1733  prev_sel_prob_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1734  problem_.src_image_idxs.size()));
1735  prev_sel_prob_map_->FillWithScalar(0.5f);
1736 
1737  cost_map_.reset(new GpuMat<float>(ref_width_, ref_height_,
1738  problem_.src_image_idxs.size()));
1739 
1740  const int ref_max_dim = std::max(ref_width_, ref_height_);
1741  global_workspace_.reset(
1742  new GpuMat<float>(ref_max_dim, problem_.src_image_idxs.size(), 2));
1743 
1744  consistency_mask_.reset(new GpuMat<uint8_t>(0, 0, 0));
1745 
1746  ComputeCudaConfig();
1747 
1748  if (options_.geom_consistency) {
1749  const NormalMap& init_normal_map =
1750  problem_.normal_maps->at(problem_.ref_image_idx);
1751  normal_map_->CopyToDevice(init_normal_map.GetPtr(),
1752  init_normal_map.GetWidth() * sizeof(float));
1753  } else {
1754  InitNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1755  *normal_map_, *rand_state_map_);
1756  }
1757 }
1758 
1759 void PatchMatchCuda::Rotate() {
1760  rotation_in_half_pi_ = (rotation_in_half_pi_ + 1) % 4;
1761 
1762  size_t width;
1763  size_t height;
1764  if (rotation_in_half_pi_ % 2 == 0) {
1765  width = ref_width_;
1766  height = ref_height_;
1767  } else {
1768  width = ref_height_;
1769  height = ref_width_;
1770  }
1771 
1772  // Rotate random map.
1773  {
1774  std::unique_ptr<GpuMatPRNG> rotated_rand_state_map(
1775  new GpuMatPRNG(width, height));
1776  rand_state_map_->Rotate(rotated_rand_state_map.get());
1777  rand_state_map_.swap(rotated_rand_state_map);
1778  }
1779 
1780  // Rotate depth map.
1781  {
1782  std::unique_ptr<GpuMat<float>> rotated_depth_map(
1783  new GpuMat<float>(width, height));
1784  depth_map_->Rotate(rotated_depth_map.get());
1785  depth_map_.swap(rotated_depth_map);
1786  }
1787 
1788  // Rotate normal map.
1789  {
1790  RotateNormalMap<<<elem_wise_grid_size_, elem_wise_block_size_>>>(
1791  *normal_map_);
1792  std::unique_ptr<GpuMat<float>> rotated_normal_map(
1793  new GpuMat<float>(width, height, 3));
1794  normal_map_->Rotate(rotated_normal_map.get());
1795  normal_map_.swap(rotated_normal_map);
1796  }
1797 
1798  // Rotate reference image.
1799  {
1800  std::unique_ptr<GpuMatRefImage> rotated_ref_image(
1801  new GpuMatRefImage(width, height));
1802  ref_image_->image->Rotate(rotated_ref_image->image.get());
1803  ref_image_->sum_image->Rotate(rotated_ref_image->sum_image.get());
1804  ref_image_->squared_sum_image->Rotate(
1805  rotated_ref_image->squared_sum_image.get());
1806  ref_image_.swap(rotated_ref_image);
1807  BindRefImageTexture();
1808  }
1809 
1810  // Rotate selection probability map.
1811  prev_sel_prob_map_.reset(
1812  new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1813  sel_prob_map_->Rotate(prev_sel_prob_map_.get());
1814  sel_prob_map_.reset(
1815  new GpuMat<float>(width, height, problem_.src_image_idxs.size()));
1816 
1817  // Rotate cost map.
1818  {
1819  std::unique_ptr<GpuMat<float>> rotated_cost_map(new GpuMat<float>(
1820  width, height, problem_.src_image_idxs.size()));
1821  cost_map_->Rotate(rotated_cost_map.get());
1822  cost_map_.swap(rotated_cost_map);
1823  }
1824 
1825  // Rotate calibration.
1826  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_],
1827  sizeof(float) * 4, 0,
1828  cudaMemcpyHostToDevice));
1829  CUDA_SAFE_CALL(
1830  cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[rotation_in_half_pi_],
1831  sizeof(float) * 4, 0, cudaMemcpyHostToDevice));
1832 
1833  // Recompute Cuda configuration for rotated reference image.
1834  ComputeCudaConfig();
1835 }
1836 
1837 } // namespace mvs
1838 } // namespace colmap