ACloudViewer  3.9.4
A Modern Library for 3D Data Processing
SVD3x3.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - CloudViewer: www.cloudViewer.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.cloudViewer.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 /**************************************************************************
8 **
9 ** svd3
10 **
11 ** Quick singular value decomposition as described by:
12 ** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
13 ** Computing the Singular Value Decomposition of 3x3 matrices
14 ** with minimal branching and elementary floating point operations,
15 ** University of Wisconsin - Madison technical report TR1690, May 2011
16 **
17 ** Implemented by: Kui Wu
18 ** kwu@cs.utah.edu
19 ** Modified by: Wei Dong and Rishabh Singh
20 **
21 ** May 2018
22 **
23 **************************************************************************/
24 
25 #pragma once
26 
27 #include <cmath>
28 
31 
32 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
33 #include <cuda.h>
34 #endif
35 
37 #include "math.h"
38 
// Integer bit patterns and thresholds used by the branchless Jacobi/Givens
// code below, which manipulates floats through the 32-bit `ui` view of
// union `un` (defined later in this file).
#define gone 1065353216  // 0x3F800000: IEEE-754 bit pattern of 1.0f
#define gsine_pi_over_eight 1053028117  // bit pattern of sin(pi/8) as float (~0.3826834f) — TODO confirm

#define gcosine_pi_over_eight 1064076127  // bit pattern of cos(pi/8) as float (~0.9238795f) — TODO confirm
#define gtiny_number 1.e-20  // cutoff below which a rotation is treated as identity
#define gfour_gamma_squared 5.8284273147583007813  // presumably 4*gamma^2 = 3 + 2*sqrt(2) from TR1690, rounded through float — verify
45 
// Host/device abstraction over CUDA's round-to-nearest arithmetic intrinsics.
//
// Device build (__CUDACC__): the __f*/__d* names already denote the hardware
// intrinsics. The self-referential function-like macros are no-ops (a macro
// name is not re-expanded inside its own replacement list); they exist only
// so both branches define the same set of names. __drsqrt_rn has no direct
// intrinsic and is composed from __dsqrt_rn followed by __drcp_rn.
//
// Host build: plain C++ arithmetic stands in for the intrinsics. The float
// variants intentionally mirror the original behavior and compute through
// double (the 1.0 literal and unqualified sqrt), so host and device results
// may differ in the last ulp.
#ifndef __CUDACC__
using std::abs;
using std::max;
using std::sqrt;
#endif

#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
#define __fadd_rn(x, y) __fadd_rn((x), (y))
#define __fsub_rn(x, y) __fsub_rn((x), (y))
#define __frsqrt_rn(x) __frsqrt_rn((x))

#define __dadd_rn(x, y) __dadd_rn((x), (y))
#define __dsub_rn(x, y) __dsub_rn((x), (y))
#define __drsqrt_rn(x) __drcp_rn(__dsqrt_rn((x)))
#else

// Host fallbacks. Arguments are fully parenthesized so that expressions such
// as __fsub_rn(a, b - c) expand correctly; the unparenthesized originals
// would silently expand to a - b - c.
#define __fadd_rn(x, y) ((x) + (y))
#define __fsub_rn(x, y) ((x) - (y))
#define __frsqrt_rn(x) (1.0 / sqrt(x))

#define __dadd_rn(x, y) ((x) + (y))
#define __dsub_rn(x, y) ((x) - (y))
#define __drsqrt_rn(x) (1.0 / sqrt(x))

#define __add_rn(x, y) ((x) + (y))
#define __sub_rn(x, y) ((x) - (y))
#define __rsqrt_rn(x) (1.0 / sqrt(x))
#endif
74 
75 namespace cloudViewer {
76 namespace core {
77 namespace linalg {
78 namespace kernel {
79 
// Type-punning view of a scalar used throughout the SVD kernels below:
// `f` holds the floating-point value while `ui` aliases its raw bits so
// sign flips and conditional selects can be done branchlessly with
// integer masks (e.g. `Stmp1.ui & Ssh.ui`).
// NOTE(review): `ui` is a 32-bit unsigned int even when scalar_t is
// double, so the integer view covers only part of a 64-bit value; the
// double-precision path appears to rely on this partial aliasing —
// confirm against the upstream svd3 implementation before changing.
template <typename scalar_t>
union un {
    scalar_t f;   // floating-point value
    unsigned int ui;  // raw-bit (32-bit) view of f
};
85 
86 template <typename scalar_t>
88  scalar_t *U_3x3,
89  scalar_t *S_3x1,
90  scalar_t *V_3x3);
91 
92 template <>
94  const double *A_3x3, double *U_3x3, double *S_3x1, double *V_3x3) {
95  double gsmall_number = 1.e-12;
96 
97  un<double> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
98  un<double> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
99  un<double> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
100  un<double> Sc, Ss, Sch, Ssh;
101  un<double> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
102  un<double> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
103  un<double> Sqvs, Sqvvx, Sqvvy, Sqvvz;
104 
105  Sa11.f = A_3x3[0];
106  Sa12.f = A_3x3[1];
107  Sa13.f = A_3x3[2];
108  Sa21.f = A_3x3[3];
109  Sa22.f = A_3x3[4];
110  Sa23.f = A_3x3[5];
111  Sa31.f = A_3x3[6];
112  Sa32.f = A_3x3[7];
113  Sa33.f = A_3x3[8];
114 
115  // ###########################################################
116  // Compute normal equations matrix
117  // ###########################################################
118 
119  Ss11.f = Sa11.f * Sa11.f;
120  Stmp1.f = Sa21.f * Sa21.f;
121  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
122  Stmp1.f = Sa31.f * Sa31.f;
123  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
124 
125  Ss21.f = Sa12.f * Sa11.f;
126  Stmp1.f = Sa22.f * Sa21.f;
127  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
128  Stmp1.f = Sa32.f * Sa31.f;
129  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
130 
131  Ss31.f = Sa13.f * Sa11.f;
132  Stmp1.f = Sa23.f * Sa21.f;
133  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
134  Stmp1.f = Sa33.f * Sa31.f;
135  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
136 
137  Ss22.f = Sa12.f * Sa12.f;
138  Stmp1.f = Sa22.f * Sa22.f;
139  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
140  Stmp1.f = Sa32.f * Sa32.f;
141  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
142 
143  Ss32.f = Sa13.f * Sa12.f;
144  Stmp1.f = Sa23.f * Sa22.f;
145  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
146  Stmp1.f = Sa33.f * Sa32.f;
147  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
148 
149  Ss33.f = Sa13.f * Sa13.f;
150  Stmp1.f = Sa23.f * Sa23.f;
151  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
152  Stmp1.f = Sa33.f * Sa33.f;
153  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
154 
155  Sqvs.f = 1.f;
156  Sqvvx.f = 0.f;
157  Sqvvy.f = 0.f;
158  Sqvvz.f = 0.f;
159 
160  // ###########################################################
161  // Solve symmetric eigenproblem using Jacobi iteration
162  // ###########################################################
163  for (int i = 0; i < 4; i++) {
164  Ssh.f = Ss21.f * 0.5f;
165  Stmp5.f = __dsub_rn(Ss11.f, Ss22.f);
166 
167  Stmp2.f = Ssh.f * Ssh.f;
168  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
169  Ssh.ui = Stmp1.ui & Ssh.ui;
170  Sch.ui = Stmp1.ui & Stmp5.ui;
171  Stmp2.ui = ~Stmp1.ui & gone;
172  Sch.ui = Sch.ui | Stmp2.ui;
173 
174  Stmp1.f = Ssh.f * Ssh.f;
175  Stmp2.f = Sch.f * Sch.f;
176  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
177  Stmp4.f = __drsqrt_rn(Stmp3.f);
178 
179  Ssh.f = Stmp4.f * Ssh.f;
180  Sch.f = Stmp4.f * Sch.f;
181  Stmp1.f = gfour_gamma_squared * Stmp1.f;
182  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
183 
184  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
185  Ssh.ui = ~Stmp1.ui & Ssh.ui;
186  Ssh.ui = Ssh.ui | Stmp2.ui;
187  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
188  Sch.ui = ~Stmp1.ui & Sch.ui;
189  Sch.ui = Sch.ui | Stmp2.ui;
190 
191  Stmp1.f = Ssh.f * Ssh.f;
192  Stmp2.f = Sch.f * Sch.f;
193  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
194  Ss.f = Sch.f * Ssh.f;
195  Ss.f = __dadd_rn(Ss.f, Ss.f);
196 
197 #ifdef DEBUG_JACOBI_CONJUGATE
198  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
199  Sch.f);
200 #endif
201  // ###########################################################
202  // Perform the actual Givens conjugation
203  // ###########################################################
204 
205  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
206  Ss33.f = Ss33.f * Stmp3.f;
207  Ss31.f = Ss31.f * Stmp3.f;
208  Ss32.f = Ss32.f * Stmp3.f;
209  Ss33.f = Ss33.f * Stmp3.f;
210 
211  Stmp1.f = Ss.f * Ss31.f;
212  Stmp2.f = Ss.f * Ss32.f;
213  Ss31.f = Sc.f * Ss31.f;
214  Ss32.f = Sc.f * Ss32.f;
215  Ss31.f = __dadd_rn(Stmp2.f, Ss31.f);
216  Ss32.f = __dsub_rn(Ss32.f, Stmp1.f);
217 
218  Stmp2.f = Ss.f * Ss.f;
219  Stmp1.f = Ss22.f * Stmp2.f;
220  Stmp3.f = Ss11.f * Stmp2.f;
221  Stmp4.f = Sc.f * Sc.f;
222  Ss11.f = Ss11.f * Stmp4.f;
223  Ss22.f = Ss22.f * Stmp4.f;
224  Ss11.f = __dadd_rn(Ss11.f, Stmp1.f);
225  Ss22.f = __dadd_rn(Ss22.f, Stmp3.f);
226  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
227  Stmp2.f = __dadd_rn(Ss21.f, Ss21.f);
228  Ss21.f = Ss21.f * Stmp4.f;
229  Stmp4.f = Sc.f * Ss.f;
230  Stmp2.f = Stmp2.f * Stmp4.f;
231  Stmp5.f = Stmp5.f * Stmp4.f;
232  Ss11.f = __dadd_rn(Ss11.f, Stmp2.f);
233  Ss21.f = __dsub_rn(Ss21.f, Stmp5.f);
234  Ss22.f = __dsub_rn(Ss22.f, Stmp2.f);
235 
236 #ifdef DEBUG_JACOBI_CONJUGATE
237  printf("%.20g\n", Ss11.f);
238  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
239  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
240 #endif
241 
242  // ###########################################################
243  // Compute the cumulative rotation, in quaternion form
244  // ###########################################################
245 
246  Stmp1.f = Ssh.f * Sqvvx.f;
247  Stmp2.f = Ssh.f * Sqvvy.f;
248  Stmp3.f = Ssh.f * Sqvvz.f;
249  Ssh.f = Ssh.f * Sqvs.f;
250 
251  Sqvs.f = Sch.f * Sqvs.f;
252  Sqvvx.f = Sch.f * Sqvvx.f;
253  Sqvvy.f = Sch.f * Sqvvy.f;
254  Sqvvz.f = Sch.f * Sqvvz.f;
255 
256  Sqvvz.f = __dadd_rn(Sqvvz.f, Ssh.f);
257  Sqvs.f = __dsub_rn(Sqvs.f, Stmp3.f);
258  Sqvvx.f = __dadd_rn(Sqvvx.f, Stmp2.f);
259  Sqvvy.f = __dsub_rn(Sqvvy.f, Stmp1.f);
260 
261 #ifdef DEBUG_JACOBI_CONJUGATE
262  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
263  Sqvs.f);
264 #endif
265 
267  // (1->3)
269  Ssh.f = Ss32.f * 0.5f;
270  Stmp5.f = __dsub_rn(Ss22.f, Ss33.f);
271 
272  Stmp2.f = Ssh.f * Ssh.f;
273  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
274  Ssh.ui = Stmp1.ui & Ssh.ui;
275  Sch.ui = Stmp1.ui & Stmp5.ui;
276  Stmp2.ui = ~Stmp1.ui & gone;
277  Sch.ui = Sch.ui | Stmp2.ui;
278 
279  Stmp1.f = Ssh.f * Ssh.f;
280  Stmp2.f = Sch.f * Sch.f;
281  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
282  Stmp4.f = __drsqrt_rn(Stmp3.f);
283 
284  Ssh.f = Stmp4.f * Ssh.f;
285  Sch.f = Stmp4.f * Sch.f;
286  Stmp1.f = gfour_gamma_squared * Stmp1.f;
287  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
288 
289  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
290  Ssh.ui = ~Stmp1.ui & Ssh.ui;
291  Ssh.ui = Ssh.ui | Stmp2.ui;
292  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
293  Sch.ui = ~Stmp1.ui & Sch.ui;
294  Sch.ui = Sch.ui | Stmp2.ui;
295 
296  Stmp1.f = Ssh.f * Ssh.f;
297  Stmp2.f = Sch.f * Sch.f;
298  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
299  Ss.f = Sch.f * Ssh.f;
300  Ss.f = __dadd_rn(Ss.f, Ss.f);
301 
302 #ifdef DEBUG_JACOBI_CONJUGATE
303  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
304  Sch.f);
305 #endif
306 
307  // ###########################################################
308  // Perform the actual Givens conjugation
309  // ###########################################################
310 
311  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
312  Ss11.f = Ss11.f * Stmp3.f;
313  Ss21.f = Ss21.f * Stmp3.f;
314  Ss31.f = Ss31.f * Stmp3.f;
315  Ss11.f = Ss11.f * Stmp3.f;
316 
317  Stmp1.f = Ss.f * Ss21.f;
318  Stmp2.f = Ss.f * Ss31.f;
319  Ss21.f = Sc.f * Ss21.f;
320  Ss31.f = Sc.f * Ss31.f;
321  Ss21.f = __dadd_rn(Stmp2.f, Ss21.f);
322  Ss31.f = __dsub_rn(Ss31.f, Stmp1.f);
323 
324  Stmp2.f = Ss.f * Ss.f;
325  Stmp1.f = Ss33.f * Stmp2.f;
326  Stmp3.f = Ss22.f * Stmp2.f;
327  Stmp4.f = Sc.f * Sc.f;
328  Ss22.f = Ss22.f * Stmp4.f;
329  Ss33.f = Ss33.f * Stmp4.f;
330  Ss22.f = __dadd_rn(Ss22.f, Stmp1.f);
331  Ss33.f = __dadd_rn(Ss33.f, Stmp3.f);
332  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
333  Stmp2.f = __dadd_rn(Ss32.f, Ss32.f);
334  Ss32.f = Ss32.f * Stmp4.f;
335  Stmp4.f = Sc.f * Ss.f;
336  Stmp2.f = Stmp2.f * Stmp4.f;
337  Stmp5.f = Stmp5.f * Stmp4.f;
338  Ss22.f = __dadd_rn(Ss22.f, Stmp2.f);
339  Ss32.f = __dsub_rn(Ss32.f, Stmp5.f);
340  Ss33.f = __dsub_rn(Ss33.f, Stmp2.f);
341 
342 #ifdef DEBUG_JACOBI_CONJUGATE
343  printf("%.20g\n", Ss11.f);
344  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
345  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
346 #endif
347 
348  // ###########################################################
349  // Compute the cumulative rotation, in quaternion form
350  // ###########################################################
351 
352  Stmp1.f = Ssh.f * Sqvvx.f;
353  Stmp2.f = Ssh.f * Sqvvy.f;
354  Stmp3.f = Ssh.f * Sqvvz.f;
355  Ssh.f = Ssh.f * Sqvs.f;
356 
357  Sqvs.f = Sch.f * Sqvs.f;
358  Sqvvx.f = Sch.f * Sqvvx.f;
359  Sqvvy.f = Sch.f * Sqvvy.f;
360  Sqvvz.f = Sch.f * Sqvvz.f;
361 
362  Sqvvx.f = __dadd_rn(Sqvvx.f, Ssh.f);
363  Sqvs.f = __dsub_rn(Sqvs.f, Stmp1.f);
364  Sqvvy.f = __dadd_rn(Sqvvy.f, Stmp3.f);
365  Sqvvz.f = __dsub_rn(Sqvvz.f, Stmp2.f);
366 
367 #ifdef DEBUG_JACOBI_CONJUGATE
368  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
369  Sqvs.f);
370 #endif
371 #if 1
373  // 1 -> 2
375 
376  Ssh.f = Ss31.f * 0.5f;
377  Stmp5.f = __dsub_rn(Ss33.f, Ss11.f);
378 
379  Stmp2.f = Ssh.f * Ssh.f;
380  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
381  Ssh.ui = Stmp1.ui & Ssh.ui;
382  Sch.ui = Stmp1.ui & Stmp5.ui;
383  Stmp2.ui = ~Stmp1.ui & gone;
384  Sch.ui = Sch.ui | Stmp2.ui;
385 
386  Stmp1.f = Ssh.f * Ssh.f;
387  Stmp2.f = Sch.f * Sch.f;
388  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
389  Stmp4.f = __drsqrt_rn(Stmp3.f);
390 
391  Ssh.f = Stmp4.f * Ssh.f;
392  Sch.f = Stmp4.f * Sch.f;
393  Stmp1.f = gfour_gamma_squared * Stmp1.f;
394  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
395 
396  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
397  Ssh.ui = ~Stmp1.ui & Ssh.ui;
398  Ssh.ui = Ssh.ui | Stmp2.ui;
399  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
400  Sch.ui = ~Stmp1.ui & Sch.ui;
401  Sch.ui = Sch.ui | Stmp2.ui;
402 
403  Stmp1.f = Ssh.f * Ssh.f;
404  Stmp2.f = Sch.f * Sch.f;
405  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
406  Ss.f = Sch.f * Ssh.f;
407  Ss.f = __dadd_rn(Ss.f, Ss.f);
408 
409 #ifdef DEBUG_JACOBI_CONJUGATE
410  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
411  Sch.f);
412 #endif
413 
414  // ###########################################################
415  // Perform the actual Givens conjugation
416  // ###########################################################
417 
418  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
419  Ss22.f = Ss22.f * Stmp3.f;
420  Ss32.f = Ss32.f * Stmp3.f;
421  Ss21.f = Ss21.f * Stmp3.f;
422  Ss22.f = Ss22.f * Stmp3.f;
423 
424  Stmp1.f = Ss.f * Ss32.f;
425  Stmp2.f = Ss.f * Ss21.f;
426  Ss32.f = Sc.f * Ss32.f;
427  Ss21.f = Sc.f * Ss21.f;
428  Ss32.f = __dadd_rn(Stmp2.f, Ss32.f);
429  Ss21.f = __dsub_rn(Ss21.f, Stmp1.f);
430 
431  Stmp2.f = Ss.f * Ss.f;
432  Stmp1.f = Ss11.f * Stmp2.f;
433  Stmp3.f = Ss33.f * Stmp2.f;
434  Stmp4.f = Sc.f * Sc.f;
435  Ss33.f = Ss33.f * Stmp4.f;
436  Ss11.f = Ss11.f * Stmp4.f;
437  Ss33.f = __dadd_rn(Ss33.f, Stmp1.f);
438  Ss11.f = __dadd_rn(Ss11.f, Stmp3.f);
439  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
440  Stmp2.f = __dadd_rn(Ss31.f, Ss31.f);
441  Ss31.f = Ss31.f * Stmp4.f;
442  Stmp4.f = Sc.f * Ss.f;
443  Stmp2.f = Stmp2.f * Stmp4.f;
444  Stmp5.f = Stmp5.f * Stmp4.f;
445  Ss33.f = __dadd_rn(Ss33.f, Stmp2.f);
446  Ss31.f = __dsub_rn(Ss31.f, Stmp5.f);
447  Ss11.f = __dsub_rn(Ss11.f, Stmp2.f);
448 
449 #ifdef DEBUG_JACOBI_CONJUGATE
450  printf("%.20g\n", Ss11.f);
451  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
452  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
453 #endif
454 
455  // ###########################################################
456  // Compute the cumulative rotation, in quaternion form
457  // ###########################################################
458 
459  Stmp1.f = Ssh.f * Sqvvx.f;
460  Stmp2.f = Ssh.f * Sqvvy.f;
461  Stmp3.f = Ssh.f * Sqvvz.f;
462  Ssh.f = Ssh.f * Sqvs.f;
463 
464  Sqvs.f = Sch.f * Sqvs.f;
465  Sqvvx.f = Sch.f * Sqvvx.f;
466  Sqvvy.f = Sch.f * Sqvvy.f;
467  Sqvvz.f = Sch.f * Sqvvz.f;
468 
469  Sqvvy.f = __dadd_rn(Sqvvy.f, Ssh.f);
470  Sqvs.f = __dsub_rn(Sqvs.f, Stmp2.f);
471  Sqvvz.f = __dadd_rn(Sqvvz.f, Stmp1.f);
472  Sqvvx.f = __dsub_rn(Sqvvx.f, Stmp3.f);
473 #endif
474  }
475 
476  // ###########################################################
477  // Normalize quaternion for matrix V
478  // ###########################################################
479 
480  Stmp2.f = Sqvs.f * Sqvs.f;
481  Stmp1.f = Sqvvx.f * Sqvvx.f;
482  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
483  Stmp1.f = Sqvvy.f * Sqvvy.f;
484  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
485  Stmp1.f = Sqvvz.f * Sqvvz.f;
486  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
487 
488  Stmp1.f = __drsqrt_rn(Stmp2.f);
489  Stmp4.f = Stmp1.f * 0.5f;
490  Stmp3.f = Stmp1.f * Stmp4.f;
491  Stmp3.f = Stmp1.f * Stmp3.f;
492  Stmp3.f = Stmp2.f * Stmp3.f;
493  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
494  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
495 
496  Sqvs.f = Sqvs.f * Stmp1.f;
497  Sqvvx.f = Sqvvx.f * Stmp1.f;
498  Sqvvy.f = Sqvvy.f * Stmp1.f;
499  Sqvvz.f = Sqvvz.f * Stmp1.f;
500 
501  // ###########################################################
502  // Transform quaternion to matrix V
503  // ###########################################################
504 
505  Stmp1.f = Sqvvx.f * Sqvvx.f;
506  Stmp2.f = Sqvvy.f * Sqvvy.f;
507  Stmp3.f = Sqvvz.f * Sqvvz.f;
508  Sv11.f = Sqvs.f * Sqvs.f;
509  Sv22.f = __dsub_rn(Sv11.f, Stmp1.f);
510  Sv33.f = __dsub_rn(Sv22.f, Stmp2.f);
511  Sv33.f = __dadd_rn(Sv33.f, Stmp3.f);
512  Sv22.f = __dadd_rn(Sv22.f, Stmp2.f);
513  Sv22.f = __dsub_rn(Sv22.f, Stmp3.f);
514  Sv11.f = __dadd_rn(Sv11.f, Stmp1.f);
515  Sv11.f = __dsub_rn(Sv11.f, Stmp2.f);
516  Sv11.f = __dsub_rn(Sv11.f, Stmp3.f);
517  Stmp1.f = __dadd_rn(Sqvvx.f, Sqvvx.f);
518  Stmp2.f = __dadd_rn(Sqvvy.f, Sqvvy.f);
519  Stmp3.f = __dadd_rn(Sqvvz.f, Sqvvz.f);
520  Sv32.f = Sqvs.f * Stmp1.f;
521  Sv13.f = Sqvs.f * Stmp2.f;
522  Sv21.f = Sqvs.f * Stmp3.f;
523  Stmp1.f = Sqvvy.f * Stmp1.f;
524  Stmp2.f = Sqvvz.f * Stmp2.f;
525  Stmp3.f = Sqvvx.f * Stmp3.f;
526  Sv12.f = __dsub_rn(Stmp1.f, Sv21.f);
527  Sv23.f = __dsub_rn(Stmp2.f, Sv32.f);
528  Sv31.f = __dsub_rn(Stmp3.f, Sv13.f);
529  Sv21.f = __dadd_rn(Stmp1.f, Sv21.f);
530  Sv32.f = __dadd_rn(Stmp2.f, Sv32.f);
531  Sv13.f = __dadd_rn(Stmp3.f, Sv13.f);
532 
534  // Multiply (from the right) with V
535  // ###########################################################
536 
537  Stmp2.f = Sa12.f;
538  Stmp3.f = Sa13.f;
539  Sa12.f = Sv12.f * Sa11.f;
540  Sa13.f = Sv13.f * Sa11.f;
541  Sa11.f = Sv11.f * Sa11.f;
542  Stmp1.f = Sv21.f * Stmp2.f;
543  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
544  Stmp1.f = Sv31.f * Stmp3.f;
545  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
546  Stmp1.f = Sv22.f * Stmp2.f;
547  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
548  Stmp1.f = Sv32.f * Stmp3.f;
549  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
550  Stmp1.f = Sv23.f * Stmp2.f;
551  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
552  Stmp1.f = Sv33.f * Stmp3.f;
553  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
554 
555  Stmp2.f = Sa22.f;
556  Stmp3.f = Sa23.f;
557  Sa22.f = Sv12.f * Sa21.f;
558  Sa23.f = Sv13.f * Sa21.f;
559  Sa21.f = Sv11.f * Sa21.f;
560  Stmp1.f = Sv21.f * Stmp2.f;
561  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
562  Stmp1.f = Sv31.f * Stmp3.f;
563  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
564  Stmp1.f = Sv22.f * Stmp2.f;
565  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
566  Stmp1.f = Sv32.f * Stmp3.f;
567  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
568  Stmp1.f = Sv23.f * Stmp2.f;
569  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
570  Stmp1.f = Sv33.f * Stmp3.f;
571  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
572 
573  Stmp2.f = Sa32.f;
574  Stmp3.f = Sa33.f;
575  Sa32.f = Sv12.f * Sa31.f;
576  Sa33.f = Sv13.f * Sa31.f;
577  Sa31.f = Sv11.f * Sa31.f;
578  Stmp1.f = Sv21.f * Stmp2.f;
579  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
580  Stmp1.f = Sv31.f * Stmp3.f;
581  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
582  Stmp1.f = Sv22.f * Stmp2.f;
583  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
584  Stmp1.f = Sv32.f * Stmp3.f;
585  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
586  Stmp1.f = Sv23.f * Stmp2.f;
587  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
588  Stmp1.f = Sv33.f * Stmp3.f;
589  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
590 
591  // ###########################################################
592  // Permute columns such that the singular values are sorted
593  // ###########################################################
594 
595  Stmp1.f = Sa11.f * Sa11.f;
596  Stmp4.f = Sa21.f * Sa21.f;
597  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
598  Stmp4.f = Sa31.f * Sa31.f;
599  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
600 
601  Stmp2.f = Sa12.f * Sa12.f;
602  Stmp4.f = Sa22.f * Sa22.f;
603  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
604  Stmp4.f = Sa32.f * Sa32.f;
605  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
606 
607  Stmp3.f = Sa13.f * Sa13.f;
608  Stmp4.f = Sa23.f * Sa23.f;
609  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
610  Stmp4.f = Sa33.f * Sa33.f;
611  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
612 
613  // Swap columns 1-2 if necessary
614 
615  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
616  Stmp5.ui = Sa11.ui ^ Sa12.ui;
617  Stmp5.ui = Stmp5.ui & Stmp4.ui;
618  Sa11.ui = Sa11.ui ^ Stmp5.ui;
619  Sa12.ui = Sa12.ui ^ Stmp5.ui;
620 
621  Stmp5.ui = Sa21.ui ^ Sa22.ui;
622  Stmp5.ui = Stmp5.ui & Stmp4.ui;
623  Sa21.ui = Sa21.ui ^ Stmp5.ui;
624  Sa22.ui = Sa22.ui ^ Stmp5.ui;
625 
626  Stmp5.ui = Sa31.ui ^ Sa32.ui;
627  Stmp5.ui = Stmp5.ui & Stmp4.ui;
628  Sa31.ui = Sa31.ui ^ Stmp5.ui;
629  Sa32.ui = Sa32.ui ^ Stmp5.ui;
630 
631  Stmp5.ui = Sv11.ui ^ Sv12.ui;
632  Stmp5.ui = Stmp5.ui & Stmp4.ui;
633  Sv11.ui = Sv11.ui ^ Stmp5.ui;
634  Sv12.ui = Sv12.ui ^ Stmp5.ui;
635 
636  Stmp5.ui = Sv21.ui ^ Sv22.ui;
637  Stmp5.ui = Stmp5.ui & Stmp4.ui;
638  Sv21.ui = Sv21.ui ^ Stmp5.ui;
639  Sv22.ui = Sv22.ui ^ Stmp5.ui;
640 
641  Stmp5.ui = Sv31.ui ^ Sv32.ui;
642  Stmp5.ui = Stmp5.ui & Stmp4.ui;
643  Sv31.ui = Sv31.ui ^ Stmp5.ui;
644  Sv32.ui = Sv32.ui ^ Stmp5.ui;
645 
646  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
647  Stmp5.ui = Stmp5.ui & Stmp4.ui;
648  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
649  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
650 
651  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
652  // is still a rotation
653 
654  Stmp5.f = -2.f;
655  Stmp5.ui = Stmp5.ui & Stmp4.ui;
656  Stmp4.f = 1.f;
657  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
658 
659  Sa12.f = Sa12.f * Stmp4.f;
660  Sa22.f = Sa22.f * Stmp4.f;
661  Sa32.f = Sa32.f * Stmp4.f;
662 
663  Sv12.f = Sv12.f * Stmp4.f;
664  Sv22.f = Sv22.f * Stmp4.f;
665  Sv32.f = Sv32.f * Stmp4.f;
666 
667  // Swap columns 1-3 if necessary
668 
669  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
670  Stmp5.ui = Sa11.ui ^ Sa13.ui;
671  Stmp5.ui = Stmp5.ui & Stmp4.ui;
672  Sa11.ui = Sa11.ui ^ Stmp5.ui;
673  Sa13.ui = Sa13.ui ^ Stmp5.ui;
674 
675  Stmp5.ui = Sa21.ui ^ Sa23.ui;
676  Stmp5.ui = Stmp5.ui & Stmp4.ui;
677  Sa21.ui = Sa21.ui ^ Stmp5.ui;
678  Sa23.ui = Sa23.ui ^ Stmp5.ui;
679 
680  Stmp5.ui = Sa31.ui ^ Sa33.ui;
681  Stmp5.ui = Stmp5.ui & Stmp4.ui;
682  Sa31.ui = Sa31.ui ^ Stmp5.ui;
683  Sa33.ui = Sa33.ui ^ Stmp5.ui;
684 
685  Stmp5.ui = Sv11.ui ^ Sv13.ui;
686  Stmp5.ui = Stmp5.ui & Stmp4.ui;
687  Sv11.ui = Sv11.ui ^ Stmp5.ui;
688  Sv13.ui = Sv13.ui ^ Stmp5.ui;
689 
690  Stmp5.ui = Sv21.ui ^ Sv23.ui;
691  Stmp5.ui = Stmp5.ui & Stmp4.ui;
692  Sv21.ui = Sv21.ui ^ Stmp5.ui;
693  Sv23.ui = Sv23.ui ^ Stmp5.ui;
694 
695  Stmp5.ui = Sv31.ui ^ Sv33.ui;
696  Stmp5.ui = Stmp5.ui & Stmp4.ui;
697  Sv31.ui = Sv31.ui ^ Stmp5.ui;
698  Sv33.ui = Sv33.ui ^ Stmp5.ui;
699 
700  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
701  Stmp5.ui = Stmp5.ui & Stmp4.ui;
702  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
703  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
704 
705  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
706  // is still a rotation
707 
708  Stmp5.f = -2.f;
709  Stmp5.ui = Stmp5.ui & Stmp4.ui;
710  Stmp4.f = 1.f;
711  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
712 
713  Sa11.f = Sa11.f * Stmp4.f;
714  Sa21.f = Sa21.f * Stmp4.f;
715  Sa31.f = Sa31.f * Stmp4.f;
716 
717  Sv11.f = Sv11.f * Stmp4.f;
718  Sv21.f = Sv21.f * Stmp4.f;
719  Sv31.f = Sv31.f * Stmp4.f;
720 
721  // Swap columns 2-3 if necessary
722 
723  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
724  Stmp5.ui = Sa12.ui ^ Sa13.ui;
725  Stmp5.ui = Stmp5.ui & Stmp4.ui;
726  Sa12.ui = Sa12.ui ^ Stmp5.ui;
727  Sa13.ui = Sa13.ui ^ Stmp5.ui;
728 
729  Stmp5.ui = Sa22.ui ^ Sa23.ui;
730  Stmp5.ui = Stmp5.ui & Stmp4.ui;
731  Sa22.ui = Sa22.ui ^ Stmp5.ui;
732  Sa23.ui = Sa23.ui ^ Stmp5.ui;
733 
734  Stmp5.ui = Sa32.ui ^ Sa33.ui;
735  Stmp5.ui = Stmp5.ui & Stmp4.ui;
736  Sa32.ui = Sa32.ui ^ Stmp5.ui;
737  Sa33.ui = Sa33.ui ^ Stmp5.ui;
738 
739  Stmp5.ui = Sv12.ui ^ Sv13.ui;
740  Stmp5.ui = Stmp5.ui & Stmp4.ui;
741  Sv12.ui = Sv12.ui ^ Stmp5.ui;
742  Sv13.ui = Sv13.ui ^ Stmp5.ui;
743 
744  Stmp5.ui = Sv22.ui ^ Sv23.ui;
745  Stmp5.ui = Stmp5.ui & Stmp4.ui;
746  Sv22.ui = Sv22.ui ^ Stmp5.ui;
747  Sv23.ui = Sv23.ui ^ Stmp5.ui;
748 
749  Stmp5.ui = Sv32.ui ^ Sv33.ui;
750  Stmp5.ui = Stmp5.ui & Stmp4.ui;
751  Sv32.ui = Sv32.ui ^ Stmp5.ui;
752  Sv33.ui = Sv33.ui ^ Stmp5.ui;
753 
754  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
755  Stmp5.ui = Stmp5.ui & Stmp4.ui;
756  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
757  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
758 
759  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
760  // is still a rotation
761 
762  Stmp5.f = -2.f;
763  Stmp5.ui = Stmp5.ui & Stmp4.ui;
764  Stmp4.f = 1.f;
765  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
766 
767  Sa13.f = Sa13.f * Stmp4.f;
768  Sa23.f = Sa23.f * Stmp4.f;
769  Sa33.f = Sa33.f * Stmp4.f;
770 
771  Sv13.f = Sv13.f * Stmp4.f;
772  Sv23.f = Sv23.f * Stmp4.f;
773  Sv33.f = Sv33.f * Stmp4.f;
774 
775  // ###########################################################
776  // Construct QR factorization of A*V (=U*D) using Givens rotations
777  // ###########################################################
778 
779  Su11.f = 1.f;
780  Su12.f = 0.f;
781  Su13.f = 0.f;
782  Su21.f = 0.f;
783  Su22.f = 1.f;
784  Su23.f = 0.f;
785  Su31.f = 0.f;
786  Su32.f = 0.f;
787  Su33.f = 1.f;
788 
789  Ssh.f = Sa21.f * Sa21.f;
790  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
791  Ssh.ui = Ssh.ui & Sa21.ui;
792 
793  Stmp5.f = 0.f;
794  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
795  Sch.f = max(Sch.f, Sa11.f);
796  Sch.f = max(Sch.f, gsmall_number);
797  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
798 
799  Stmp1.f = Sch.f * Sch.f;
800  Stmp2.f = Ssh.f * Ssh.f;
801  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
802  Stmp1.f = __drsqrt_rn(Stmp2.f);
803 
804  Stmp4.f = Stmp1.f * 0.5f;
805  Stmp3.f = Stmp1.f * Stmp4.f;
806  Stmp3.f = Stmp1.f * Stmp3.f;
807  Stmp3.f = Stmp2.f * Stmp3.f;
808  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
809  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
810  Stmp1.f = Stmp1.f * Stmp2.f;
811 
812  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
813 
814  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
815  Stmp2.ui = ~Stmp5.ui & Sch.ui;
816  Sch.ui = Stmp5.ui & Sch.ui;
817  Ssh.ui = Stmp5.ui & Ssh.ui;
818  Sch.ui = Sch.ui | Stmp1.ui;
819  Ssh.ui = Ssh.ui | Stmp2.ui;
820 
821  Stmp1.f = Sch.f * Sch.f;
822  Stmp2.f = Ssh.f * Ssh.f;
823  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
824  Stmp1.f = __drsqrt_rn(Stmp2.f);
825 
826  Stmp4.f = Stmp1.f * 0.5f;
827  Stmp3.f = Stmp1.f * Stmp4.f;
828  Stmp3.f = Stmp1.f * Stmp3.f;
829  Stmp3.f = Stmp2.f * Stmp3.f;
830  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
831  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
832 
833  Sch.f = Sch.f * Stmp1.f;
834  Ssh.f = Ssh.f * Stmp1.f;
835 
836  Sc.f = Sch.f * Sch.f;
837  Ss.f = Ssh.f * Ssh.f;
838  Sc.f = __dsub_rn(Sc.f, Ss.f);
839  Ss.f = Ssh.f * Sch.f;
840  Ss.f = __dadd_rn(Ss.f, Ss.f);
841 
842  // ###########################################################
843  // Rotate matrix A
844  // ###########################################################
845 
846  Stmp1.f = Ss.f * Sa11.f;
847  Stmp2.f = Ss.f * Sa21.f;
848  Sa11.f = Sc.f * Sa11.f;
849  Sa21.f = Sc.f * Sa21.f;
850  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
851  Sa21.f = __dsub_rn(Sa21.f, Stmp1.f);
852 
853  Stmp1.f = Ss.f * Sa12.f;
854  Stmp2.f = Ss.f * Sa22.f;
855  Sa12.f = Sc.f * Sa12.f;
856  Sa22.f = Sc.f * Sa22.f;
857  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
858  Sa22.f = __dsub_rn(Sa22.f, Stmp1.f);
859 
860  Stmp1.f = Ss.f * Sa13.f;
861  Stmp2.f = Ss.f * Sa23.f;
862  Sa13.f = Sc.f * Sa13.f;
863  Sa23.f = Sc.f * Sa23.f;
864  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
865  Sa23.f = __dsub_rn(Sa23.f, Stmp1.f);
866 
867  // ###########################################################
868  // Update matrix U
869  // ###########################################################
870 
871  Stmp1.f = Ss.f * Su11.f;
872  Stmp2.f = Ss.f * Su12.f;
873  Su11.f = Sc.f * Su11.f;
874  Su12.f = Sc.f * Su12.f;
875  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
876  Su12.f = __dsub_rn(Su12.f, Stmp1.f);
877 
878  Stmp1.f = Ss.f * Su21.f;
879  Stmp2.f = Ss.f * Su22.f;
880  Su21.f = Sc.f * Su21.f;
881  Su22.f = Sc.f * Su22.f;
882  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
883  Su22.f = __dsub_rn(Su22.f, Stmp1.f);
884 
885  Stmp1.f = Ss.f * Su31.f;
886  Stmp2.f = Ss.f * Su32.f;
887  Su31.f = Sc.f * Su31.f;
888  Su32.f = Sc.f * Su32.f;
889  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
890  Su32.f = __dsub_rn(Su32.f, Stmp1.f);
891 
892  // Second Givens rotation
893 
894  Ssh.f = Sa31.f * Sa31.f;
895  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
896  Ssh.ui = Ssh.ui & Sa31.ui;
897 
898  Stmp5.f = 0.f;
899  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
900  Sch.f = max(Sch.f, Sa11.f);
901  Sch.f = max(Sch.f, gsmall_number);
902  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
903 
904  Stmp1.f = Sch.f * Sch.f;
905  Stmp2.f = Ssh.f * Ssh.f;
906  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
907  Stmp1.f = __drsqrt_rn(Stmp2.f);
908 
909  Stmp4.f = Stmp1.f * 0.5;
910  Stmp3.f = Stmp1.f * Stmp4.f;
911  Stmp3.f = Stmp1.f * Stmp3.f;
912  Stmp3.f = Stmp2.f * Stmp3.f;
913  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
914  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
915  Stmp1.f = Stmp1.f * Stmp2.f;
916 
917  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
918 
919  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
920  Stmp2.ui = ~Stmp5.ui & Sch.ui;
921  Sch.ui = Stmp5.ui & Sch.ui;
922  Ssh.ui = Stmp5.ui & Ssh.ui;
923  Sch.ui = Sch.ui | Stmp1.ui;
924  Ssh.ui = Ssh.ui | Stmp2.ui;
925 
926  Stmp1.f = Sch.f * Sch.f;
927  Stmp2.f = Ssh.f * Ssh.f;
928  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
929  Stmp1.f = __drsqrt_rn(Stmp2.f);
930 
931  Stmp4.f = Stmp1.f * 0.5f;
932  Stmp3.f = Stmp1.f * Stmp4.f;
933  Stmp3.f = Stmp1.f * Stmp3.f;
934  Stmp3.f = Stmp2.f * Stmp3.f;
935  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
936  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
937 
938  Sch.f = Sch.f * Stmp1.f;
939  Ssh.f = Ssh.f * Stmp1.f;
940 
941  Sc.f = Sch.f * Sch.f;
942  Ss.f = Ssh.f * Ssh.f;
943  Sc.f = __dsub_rn(Sc.f, Ss.f);
944  Ss.f = Ssh.f * Sch.f;
945  Ss.f = __dadd_rn(Ss.f, Ss.f);
946 
947  // ###########################################################
948  // Rotate matrix A
949  // ###########################################################
950 
951  Stmp1.f = Ss.f * Sa11.f;
952  Stmp2.f = Ss.f * Sa31.f;
953  Sa11.f = Sc.f * Sa11.f;
954  Sa31.f = Sc.f * Sa31.f;
955  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
956  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
957 
958  Stmp1.f = Ss.f * Sa12.f;
959  Stmp2.f = Ss.f * Sa32.f;
960  Sa12.f = Sc.f * Sa12.f;
961  Sa32.f = Sc.f * Sa32.f;
962  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
963  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
964 
965  Stmp1.f = Ss.f * Sa13.f;
966  Stmp2.f = Ss.f * Sa33.f;
967  Sa13.f = Sc.f * Sa13.f;
968  Sa33.f = Sc.f * Sa33.f;
969  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
970  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
971 
972  // ###########################################################
973  // Update matrix U
974  // ###########################################################
975 
976  Stmp1.f = Ss.f * Su11.f;
977  Stmp2.f = Ss.f * Su13.f;
978  Su11.f = Sc.f * Su11.f;
979  Su13.f = Sc.f * Su13.f;
980  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
981  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
982 
983  Stmp1.f = Ss.f * Su21.f;
984  Stmp2.f = Ss.f * Su23.f;
985  Su21.f = Sc.f * Su21.f;
986  Su23.f = Sc.f * Su23.f;
987  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
988  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
989 
990  Stmp1.f = Ss.f * Su31.f;
991  Stmp2.f = Ss.f * Su33.f;
992  Su31.f = Sc.f * Su31.f;
993  Su33.f = Sc.f * Su33.f;
994  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
995  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
996 
997  // Third Givens Rotation
998 
999  Ssh.f = Sa32.f * Sa32.f;
1000  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1001  Ssh.ui = Ssh.ui & Sa32.ui;
1002 
1003  Stmp5.f = 0.f;
1004  Sch.f = __dsub_rn(Stmp5.f, Sa22.f);
1005  Sch.f = max(Sch.f, Sa22.f);
1006  Sch.f = max(Sch.f, gsmall_number);
1007  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
1008 
1009  Stmp1.f = Sch.f * Sch.f;
1010  Stmp2.f = Ssh.f * Ssh.f;
1011  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1012  Stmp1.f = __drsqrt_rn(Stmp2.f);
1013 
1014  Stmp4.f = Stmp1.f * 0.5f;
1015  Stmp3.f = Stmp1.f * Stmp4.f;
1016  Stmp3.f = Stmp1.f * Stmp3.f;
1017  Stmp3.f = Stmp2.f * Stmp3.f;
1018  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1019  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1020  Stmp1.f = Stmp1.f * Stmp2.f;
1021 
1022  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
1023 
1024  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1025  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1026  Sch.ui = Stmp5.ui & Sch.ui;
1027  Ssh.ui = Stmp5.ui & Ssh.ui;
1028  Sch.ui = Sch.ui | Stmp1.ui;
1029  Ssh.ui = Ssh.ui | Stmp2.ui;
1030 
1031  Stmp1.f = Sch.f * Sch.f;
1032  Stmp2.f = Ssh.f * Ssh.f;
1033  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1034  Stmp1.f = __drsqrt_rn(Stmp2.f);
1035 
1036  Stmp4.f = Stmp1.f * 0.5f;
1037  Stmp3.f = Stmp1.f * Stmp4.f;
1038  Stmp3.f = Stmp1.f * Stmp3.f;
1039  Stmp3.f = Stmp2.f * Stmp3.f;
1040  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1041  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1042 
1043  Sch.f = Sch.f * Stmp1.f;
1044  Ssh.f = Ssh.f * Stmp1.f;
1045 
1046  Sc.f = Sch.f * Sch.f;
1047  Ss.f = Ssh.f * Ssh.f;
1048  Sc.f = __dsub_rn(Sc.f, Ss.f);
1049  Ss.f = Ssh.f * Sch.f;
1050  Ss.f = __dadd_rn(Ss.f, Ss.f);
1051 
1052  // ###########################################################
1053  // Rotate matrix A
1054  // ###########################################################
1055 
1056  Stmp1.f = Ss.f * Sa21.f;
1057  Stmp2.f = Ss.f * Sa31.f;
1058  Sa21.f = Sc.f * Sa21.f;
1059  Sa31.f = Sc.f * Sa31.f;
1060  Sa21.f = __dadd_rn(Sa21.f, Stmp2.f);
1061  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
1062 
1063  Stmp1.f = Ss.f * Sa22.f;
1064  Stmp2.f = Ss.f * Sa32.f;
1065  Sa22.f = Sc.f * Sa22.f;
1066  Sa32.f = Sc.f * Sa32.f;
1067  Sa22.f = __dadd_rn(Sa22.f, Stmp2.f);
1068  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
1069 
1070  Stmp1.f = Ss.f * Sa23.f;
1071  Stmp2.f = Ss.f * Sa33.f;
1072  Sa23.f = Sc.f * Sa23.f;
1073  Sa33.f = Sc.f * Sa33.f;
1074  Sa23.f = __dadd_rn(Sa23.f, Stmp2.f);
1075  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
1076 
1077  // ###########################################################
1078  // Update matrix U
1079  // ###########################################################
1080 
1081  Stmp1.f = Ss.f * Su12.f;
1082  Stmp2.f = Ss.f * Su13.f;
1083  Su12.f = Sc.f * Su12.f;
1084  Su13.f = Sc.f * Su13.f;
1085  Su12.f = __dadd_rn(Su12.f, Stmp2.f);
1086  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1087 
1088  Stmp1.f = Ss.f * Su22.f;
1089  Stmp2.f = Ss.f * Su23.f;
1090  Su22.f = Sc.f * Su22.f;
1091  Su23.f = Sc.f * Su23.f;
1092  Su22.f = __dadd_rn(Su22.f, Stmp2.f);
1093  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1094 
1095  Stmp1.f = Ss.f * Su32.f;
1096  Stmp2.f = Ss.f * Su33.f;
1097  Su32.f = Sc.f * Su32.f;
1098  Su33.f = Sc.f * Su33.f;
1099  Su32.f = __dadd_rn(Su32.f, Stmp2.f);
1100  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1101 
1102  V_3x3[0] = Sv11.f;
1103  V_3x3[1] = Sv12.f;
1104  V_3x3[2] = Sv13.f;
1105  V_3x3[3] = Sv21.f;
1106  V_3x3[4] = Sv22.f;
1107  V_3x3[5] = Sv23.f;
1108  V_3x3[6] = Sv31.f;
1109  V_3x3[7] = Sv32.f;
1110  V_3x3[8] = Sv33.f;
1111 
1112  U_3x3[0] = Su11.f;
1113  U_3x3[1] = Su12.f;
1114  U_3x3[2] = Su13.f;
1115  U_3x3[3] = Su21.f;
1116  U_3x3[4] = Su22.f;
1117  U_3x3[5] = Su23.f;
1118  U_3x3[6] = Su31.f;
1119  U_3x3[7] = Su32.f;
1120  U_3x3[8] = Su33.f;
1121 
1122  S_3x1[0] = Sa11.f;
1123  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
1124  S_3x1[1] = Sa22.f;
1125  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
1126  S_3x1[2] = Sa33.f;
1127 }
1128 
1129 template <>
1131  const float *A_3x3, float *U_3x3, float *S_3x1, float *V_3x3) {
1132  float gsmall_number = 1.e-12;
1133 
1134  un<float> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
1135  un<float> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
1136  un<float> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
1137  un<float> Sc, Ss, Sch, Ssh;
1138  un<float> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
1139  un<float> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
1140  un<float> Sqvs, Sqvvx, Sqvvy, Sqvvz;
1141 
1142  Sa11.f = A_3x3[0];
1143  Sa12.f = A_3x3[1];
1144  Sa13.f = A_3x3[2];
1145  Sa21.f = A_3x3[3];
1146  Sa22.f = A_3x3[4];
1147  Sa23.f = A_3x3[5];
1148  Sa31.f = A_3x3[6];
1149  Sa32.f = A_3x3[7];
1150  Sa33.f = A_3x3[8];
1151 
1152  // ###########################################################
1153  // Compute normal equations matrix
1154  // ###########################################################
1155 
1156  Ss11.f = Sa11.f * Sa11.f;
1157  Stmp1.f = Sa21.f * Sa21.f;
1158  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1159  Stmp1.f = Sa31.f * Sa31.f;
1160  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1161 
1162  Ss21.f = Sa12.f * Sa11.f;
1163  Stmp1.f = Sa22.f * Sa21.f;
1164  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1165  Stmp1.f = Sa32.f * Sa31.f;
1166  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1167 
1168  Ss31.f = Sa13.f * Sa11.f;
1169  Stmp1.f = Sa23.f * Sa21.f;
1170  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1171  Stmp1.f = Sa33.f * Sa31.f;
1172  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1173 
1174  Ss22.f = Sa12.f * Sa12.f;
1175  Stmp1.f = Sa22.f * Sa22.f;
1176  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1177  Stmp1.f = Sa32.f * Sa32.f;
1178  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1179 
1180  Ss32.f = Sa13.f * Sa12.f;
1181  Stmp1.f = Sa23.f * Sa22.f;
1182  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1183  Stmp1.f = Sa33.f * Sa32.f;
1184  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1185 
1186  Ss33.f = Sa13.f * Sa13.f;
1187  Stmp1.f = Sa23.f * Sa23.f;
1188  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1189  Stmp1.f = Sa33.f * Sa33.f;
1190  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1191 
1192  Sqvs.f = 1.f;
1193  Sqvvx.f = 0.f;
1194  Sqvvy.f = 0.f;
1195  Sqvvz.f = 0.f;
1196 
1197  // ###########################################################
1198  // Solve symmetric eigenproblem using Jacobi iteration
1199  // ###########################################################
1200  for (int i = 0; i < 4; i++) {
1201  Ssh.f = Ss21.f * 0.5f;
1202  Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);
1203 
1204  Stmp2.f = Ssh.f * Ssh.f;
1205  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1206  Ssh.ui = Stmp1.ui & Ssh.ui;
1207  Sch.ui = Stmp1.ui & Stmp5.ui;
1208  Stmp2.ui = ~Stmp1.ui & gone;
1209  Sch.ui = Sch.ui | Stmp2.ui;
1210 
1211  Stmp1.f = Ssh.f * Ssh.f;
1212  Stmp2.f = Sch.f * Sch.f;
1213  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1214  Stmp4.f = __frsqrt_rn(Stmp3.f);
1215 
1216  Ssh.f = Stmp4.f * Ssh.f;
1217  Sch.f = Stmp4.f * Sch.f;
1218  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1219  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1220 
1221  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1222  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1223  Ssh.ui = Ssh.ui | Stmp2.ui;
1224  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1225  Sch.ui = ~Stmp1.ui & Sch.ui;
1226  Sch.ui = Sch.ui | Stmp2.ui;
1227 
1228  Stmp1.f = Ssh.f * Ssh.f;
1229  Stmp2.f = Sch.f * Sch.f;
1230  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1231  Ss.f = Sch.f * Ssh.f;
1232  Ss.f = __fadd_rn(Ss.f, Ss.f);
1233 
1234 #ifdef DEBUG_JACOBI_CONJUGATE
1235  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1236  Sch.f);
1237 #endif
1238  // ###########################################################
1239  // Perform the actual Givens conjugation
1240  // ###########################################################
1241 
1242  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1243  Ss33.f = Ss33.f * Stmp3.f;
1244  Ss31.f = Ss31.f * Stmp3.f;
1245  Ss32.f = Ss32.f * Stmp3.f;
1246  Ss33.f = Ss33.f * Stmp3.f;
1247 
1248  Stmp1.f = Ss.f * Ss31.f;
1249  Stmp2.f = Ss.f * Ss32.f;
1250  Ss31.f = Sc.f * Ss31.f;
1251  Ss32.f = Sc.f * Ss32.f;
1252  Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
1253  Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);
1254 
1255  Stmp2.f = Ss.f * Ss.f;
1256  Stmp1.f = Ss22.f * Stmp2.f;
1257  Stmp3.f = Ss11.f * Stmp2.f;
1258  Stmp4.f = Sc.f * Sc.f;
1259  Ss11.f = Ss11.f * Stmp4.f;
1260  Ss22.f = Ss22.f * Stmp4.f;
1261  Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
1262  Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
1263  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1264  Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
1265  Ss21.f = Ss21.f * Stmp4.f;
1266  Stmp4.f = Sc.f * Ss.f;
1267  Stmp2.f = Stmp2.f * Stmp4.f;
1268  Stmp5.f = Stmp5.f * Stmp4.f;
1269  Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
1270  Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
1271  Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);
1272 
1273 #ifdef DEBUG_JACOBI_CONJUGATE
1274  printf("%.20g\n", Ss11.f);
1275  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1276  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1277 #endif
1278 
1279  // ###########################################################
1280  // Compute the cumulative rotation, in quaternion form
1281  // ###########################################################
1282 
1283  Stmp1.f = Ssh.f * Sqvvx.f;
1284  Stmp2.f = Ssh.f * Sqvvy.f;
1285  Stmp3.f = Ssh.f * Sqvvz.f;
1286  Ssh.f = Ssh.f * Sqvs.f;
1287 
1288  Sqvs.f = Sch.f * Sqvs.f;
1289  Sqvvx.f = Sch.f * Sqvvx.f;
1290  Sqvvy.f = Sch.f * Sqvvy.f;
1291  Sqvvz.f = Sch.f * Sqvvz.f;
1292 
1293  Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
1294  Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
1295  Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
1296  Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);
1297 
1298 #ifdef DEBUG_JACOBI_CONJUGATE
1299  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1300  Sqvs.f);
1301 #endif
1302 
1304  // (1->3)
1306  Ssh.f = Ss32.f * 0.5f;
1307  Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);
1308 
1309  Stmp2.f = Ssh.f * Ssh.f;
1310  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1311  Ssh.ui = Stmp1.ui & Ssh.ui;
1312  Sch.ui = Stmp1.ui & Stmp5.ui;
1313  Stmp2.ui = ~Stmp1.ui & gone;
1314  Sch.ui = Sch.ui | Stmp2.ui;
1315 
1316  Stmp1.f = Ssh.f * Ssh.f;
1317  Stmp2.f = Sch.f * Sch.f;
1318  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1319  Stmp4.f = __frsqrt_rn(Stmp3.f);
1320 
1321  Ssh.f = Stmp4.f * Ssh.f;
1322  Sch.f = Stmp4.f * Sch.f;
1323  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1324  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1325 
1326  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1327  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1328  Ssh.ui = Ssh.ui | Stmp2.ui;
1329  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1330  Sch.ui = ~Stmp1.ui & Sch.ui;
1331  Sch.ui = Sch.ui | Stmp2.ui;
1332 
1333  Stmp1.f = Ssh.f * Ssh.f;
1334  Stmp2.f = Sch.f * Sch.f;
1335  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1336  Ss.f = Sch.f * Ssh.f;
1337  Ss.f = __fadd_rn(Ss.f, Ss.f);
1338 
1339 #ifdef DEBUG_JACOBI_CONJUGATE
1340  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1341  Sch.f);
1342 #endif
1343 
1344  // ###########################################################
1345  // Perform the actual Givens conjugation
1346  // ###########################################################
1347 
1348  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1349  Ss11.f = Ss11.f * Stmp3.f;
1350  Ss21.f = Ss21.f * Stmp3.f;
1351  Ss31.f = Ss31.f * Stmp3.f;
1352  Ss11.f = Ss11.f * Stmp3.f;
1353 
1354  Stmp1.f = Ss.f * Ss21.f;
1355  Stmp2.f = Ss.f * Ss31.f;
1356  Ss21.f = Sc.f * Ss21.f;
1357  Ss31.f = Sc.f * Ss31.f;
1358  Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
1359  Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);
1360 
1361  Stmp2.f = Ss.f * Ss.f;
1362  Stmp1.f = Ss33.f * Stmp2.f;
1363  Stmp3.f = Ss22.f * Stmp2.f;
1364  Stmp4.f = Sc.f * Sc.f;
1365  Ss22.f = Ss22.f * Stmp4.f;
1366  Ss33.f = Ss33.f * Stmp4.f;
1367  Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
1368  Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
1369  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1370  Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
1371  Ss32.f = Ss32.f * Stmp4.f;
1372  Stmp4.f = Sc.f * Ss.f;
1373  Stmp2.f = Stmp2.f * Stmp4.f;
1374  Stmp5.f = Stmp5.f * Stmp4.f;
1375  Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
1376  Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
1377  Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);
1378 
1379 #ifdef DEBUG_JACOBI_CONJUGATE
1380  printf("%.20g\n", Ss11.f);
1381  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1382  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1383 #endif
1384 
1385  // ###########################################################
1386  // Compute the cumulative rotation, in quaternion form
1387  // ###########################################################
1388 
1389  Stmp1.f = Ssh.f * Sqvvx.f;
1390  Stmp2.f = Ssh.f * Sqvvy.f;
1391  Stmp3.f = Ssh.f * Sqvvz.f;
1392  Ssh.f = Ssh.f * Sqvs.f;
1393 
1394  Sqvs.f = Sch.f * Sqvs.f;
1395  Sqvvx.f = Sch.f * Sqvvx.f;
1396  Sqvvy.f = Sch.f * Sqvvy.f;
1397  Sqvvz.f = Sch.f * Sqvvz.f;
1398 
1399  Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
1400  Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
1401  Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
1402  Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);
1403 
1404 #ifdef DEBUG_JACOBI_CONJUGATE
1405  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1406  Sqvs.f);
1407 #endif
1408 #if 1
1410  // 1 -> 2
1412 
1413  Ssh.f = Ss31.f * 0.5f;
1414  Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);
1415 
1416  Stmp2.f = Ssh.f * Ssh.f;
1417  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1418  Ssh.ui = Stmp1.ui & Ssh.ui;
1419  Sch.ui = Stmp1.ui & Stmp5.ui;
1420  Stmp2.ui = ~Stmp1.ui & gone;
1421  Sch.ui = Sch.ui | Stmp2.ui;
1422 
1423  Stmp1.f = Ssh.f * Ssh.f;
1424  Stmp2.f = Sch.f * Sch.f;
1425  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1426  Stmp4.f = __frsqrt_rn(Stmp3.f);
1427 
1428  Ssh.f = Stmp4.f * Ssh.f;
1429  Sch.f = Stmp4.f * Sch.f;
1430  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1431  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1432 
1433  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1434  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1435  Ssh.ui = Ssh.ui | Stmp2.ui;
1436  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1437  Sch.ui = ~Stmp1.ui & Sch.ui;
1438  Sch.ui = Sch.ui | Stmp2.ui;
1439 
1440  Stmp1.f = Ssh.f * Ssh.f;
1441  Stmp2.f = Sch.f * Sch.f;
1442  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1443  Ss.f = Sch.f * Ssh.f;
1444  Ss.f = __fadd_rn(Ss.f, Ss.f);
1445 
1446 #ifdef DEBUG_JACOBI_CONJUGATE
1447  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1448  Sch.f);
1449 #endif
1450 
1451  // ###########################################################
1452  // Perform the actual Givens conjugation
1453  // ###########################################################
1454 
1455  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1456  Ss22.f = Ss22.f * Stmp3.f;
1457  Ss32.f = Ss32.f * Stmp3.f;
1458  Ss21.f = Ss21.f * Stmp3.f;
1459  Ss22.f = Ss22.f * Stmp3.f;
1460 
1461  Stmp1.f = Ss.f * Ss32.f;
1462  Stmp2.f = Ss.f * Ss21.f;
1463  Ss32.f = Sc.f * Ss32.f;
1464  Ss21.f = Sc.f * Ss21.f;
1465  Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
1466  Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);
1467 
1468  Stmp2.f = Ss.f * Ss.f;
1469  Stmp1.f = Ss11.f * Stmp2.f;
1470  Stmp3.f = Ss33.f * Stmp2.f;
1471  Stmp4.f = Sc.f * Sc.f;
1472  Ss33.f = Ss33.f * Stmp4.f;
1473  Ss11.f = Ss11.f * Stmp4.f;
1474  Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
1475  Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
1476  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1477  Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
1478  Ss31.f = Ss31.f * Stmp4.f;
1479  Stmp4.f = Sc.f * Ss.f;
1480  Stmp2.f = Stmp2.f * Stmp4.f;
1481  Stmp5.f = Stmp5.f * Stmp4.f;
1482  Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
1483  Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
1484  Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);
1485 
1486 #ifdef DEBUG_JACOBI_CONJUGATE
1487  printf("%.20g\n", Ss11.f);
1488  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1489  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1490 #endif
1491 
1492  // ###########################################################
1493  // Compute the cumulative rotation, in quaternion form
1494  // ###########################################################
1495 
1496  Stmp1.f = Ssh.f * Sqvvx.f;
1497  Stmp2.f = Ssh.f * Sqvvy.f;
1498  Stmp3.f = Ssh.f * Sqvvz.f;
1499  Ssh.f = Ssh.f * Sqvs.f;
1500 
1501  Sqvs.f = Sch.f * Sqvs.f;
1502  Sqvvx.f = Sch.f * Sqvvx.f;
1503  Sqvvy.f = Sch.f * Sqvvy.f;
1504  Sqvvz.f = Sch.f * Sqvvz.f;
1505 
1506  Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
1507  Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
1508  Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
1509  Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
1510 #endif
1511  }
1512 
1513  // ###########################################################
1514  // Normalize quaternion for matrix V
1515  // ###########################################################
1516 
1517  Stmp2.f = Sqvs.f * Sqvs.f;
1518  Stmp1.f = Sqvvx.f * Sqvvx.f;
1519  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1520  Stmp1.f = Sqvvy.f * Sqvvy.f;
1521  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1522  Stmp1.f = Sqvvz.f * Sqvvz.f;
1523  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1524 
1525  Stmp1.f = __frsqrt_rn(Stmp2.f);
1526  Stmp4.f = Stmp1.f * 0.5f;
1527  Stmp3.f = Stmp1.f * Stmp4.f;
1528  Stmp3.f = Stmp1.f * Stmp3.f;
1529  Stmp3.f = Stmp2.f * Stmp3.f;
1530  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1531  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1532 
1533  Sqvs.f = Sqvs.f * Stmp1.f;
1534  Sqvvx.f = Sqvvx.f * Stmp1.f;
1535  Sqvvy.f = Sqvvy.f * Stmp1.f;
1536  Sqvvz.f = Sqvvz.f * Stmp1.f;
1537 
1538  // ###########################################################
1539  // Transform quaternion to matrix V
1540  // ###########################################################
1541 
1542  Stmp1.f = Sqvvx.f * Sqvvx.f;
1543  Stmp2.f = Sqvvy.f * Sqvvy.f;
1544  Stmp3.f = Sqvvz.f * Sqvvz.f;
1545  Sv11.f = Sqvs.f * Sqvs.f;
1546  Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
1547  Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
1548  Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
1549  Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
1550  Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
1551  Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
1552  Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
1553  Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
1554  Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
1555  Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
1556  Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
1557  Sv32.f = Sqvs.f * Stmp1.f;
1558  Sv13.f = Sqvs.f * Stmp2.f;
1559  Sv21.f = Sqvs.f * Stmp3.f;
1560  Stmp1.f = Sqvvy.f * Stmp1.f;
1561  Stmp2.f = Sqvvz.f * Stmp2.f;
1562  Stmp3.f = Sqvvx.f * Stmp3.f;
1563  Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
1564  Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
1565  Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
1566  Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
1567  Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
1568  Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);
1569 
1571  // Multiply (from the right) with V
1572  // ###########################################################
1573 
1574  Stmp2.f = Sa12.f;
1575  Stmp3.f = Sa13.f;
1576  Sa12.f = Sv12.f * Sa11.f;
1577  Sa13.f = Sv13.f * Sa11.f;
1578  Sa11.f = Sv11.f * Sa11.f;
1579  Stmp1.f = Sv21.f * Stmp2.f;
1580  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1581  Stmp1.f = Sv31.f * Stmp3.f;
1582  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1583  Stmp1.f = Sv22.f * Stmp2.f;
1584  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1585  Stmp1.f = Sv32.f * Stmp3.f;
1586  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1587  Stmp1.f = Sv23.f * Stmp2.f;
1588  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1589  Stmp1.f = Sv33.f * Stmp3.f;
1590  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1591 
1592  Stmp2.f = Sa22.f;
1593  Stmp3.f = Sa23.f;
1594  Sa22.f = Sv12.f * Sa21.f;
1595  Sa23.f = Sv13.f * Sa21.f;
1596  Sa21.f = Sv11.f * Sa21.f;
1597  Stmp1.f = Sv21.f * Stmp2.f;
1598  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1599  Stmp1.f = Sv31.f * Stmp3.f;
1600  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1601  Stmp1.f = Sv22.f * Stmp2.f;
1602  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1603  Stmp1.f = Sv32.f * Stmp3.f;
1604  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1605  Stmp1.f = Sv23.f * Stmp2.f;
1606  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1607  Stmp1.f = Sv33.f * Stmp3.f;
1608  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1609 
1610  Stmp2.f = Sa32.f;
1611  Stmp3.f = Sa33.f;
1612  Sa32.f = Sv12.f * Sa31.f;
1613  Sa33.f = Sv13.f * Sa31.f;
1614  Sa31.f = Sv11.f * Sa31.f;
1615  Stmp1.f = Sv21.f * Stmp2.f;
1616  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1617  Stmp1.f = Sv31.f * Stmp3.f;
1618  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1619  Stmp1.f = Sv22.f * Stmp2.f;
1620  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1621  Stmp1.f = Sv32.f * Stmp3.f;
1622  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1623  Stmp1.f = Sv23.f * Stmp2.f;
1624  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1625  Stmp1.f = Sv33.f * Stmp3.f;
1626  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1627 
1628  // ###########################################################
1629  // Permute columns such that the singular values are sorted
1630  // ###########################################################
1631 
1632  Stmp1.f = Sa11.f * Sa11.f;
1633  Stmp4.f = Sa21.f * Sa21.f;
1634  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1635  Stmp4.f = Sa31.f * Sa31.f;
1636  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1637 
1638  Stmp2.f = Sa12.f * Sa12.f;
1639  Stmp4.f = Sa22.f * Sa22.f;
1640  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1641  Stmp4.f = Sa32.f * Sa32.f;
1642  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1643 
1644  Stmp3.f = Sa13.f * Sa13.f;
1645  Stmp4.f = Sa23.f * Sa23.f;
1646  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1647  Stmp4.f = Sa33.f * Sa33.f;
1648  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1649 
1650  // Swap columns 1-2 if necessary
1651 
1652  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
1653  Stmp5.ui = Sa11.ui ^ Sa12.ui;
1654  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1655  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1656  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1657 
1658  Stmp5.ui = Sa21.ui ^ Sa22.ui;
1659  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1660  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1661  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1662 
1663  Stmp5.ui = Sa31.ui ^ Sa32.ui;
1664  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1665  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1666  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1667 
1668  Stmp5.ui = Sv11.ui ^ Sv12.ui;
1669  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1670  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1671  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1672 
1673  Stmp5.ui = Sv21.ui ^ Sv22.ui;
1674  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1675  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1676  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1677 
1678  Stmp5.ui = Sv31.ui ^ Sv32.ui;
1679  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1680  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1681  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1682 
1683  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
1684  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1685  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1686  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1687 
1688  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
1689  // is still a rotation
1690 
1691  Stmp5.f = -2.f;
1692  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1693  Stmp4.f = 1.f;
1694  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1695 
1696  Sa12.f = Sa12.f * Stmp4.f;
1697  Sa22.f = Sa22.f * Stmp4.f;
1698  Sa32.f = Sa32.f * Stmp4.f;
1699 
1700  Sv12.f = Sv12.f * Stmp4.f;
1701  Sv22.f = Sv22.f * Stmp4.f;
1702  Sv32.f = Sv32.f * Stmp4.f;
1703 
1704  // Swap columns 1-3 if necessary
1705 
1706  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
1707  Stmp5.ui = Sa11.ui ^ Sa13.ui;
1708  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1709  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1710  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1711 
1712  Stmp5.ui = Sa21.ui ^ Sa23.ui;
1713  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1714  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1715  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1716 
1717  Stmp5.ui = Sa31.ui ^ Sa33.ui;
1718  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1719  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1720  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1721 
1722  Stmp5.ui = Sv11.ui ^ Sv13.ui;
1723  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1724  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1725  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1726 
1727  Stmp5.ui = Sv21.ui ^ Sv23.ui;
1728  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1729  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1730  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1731 
1732  Stmp5.ui = Sv31.ui ^ Sv33.ui;
1733  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1734  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1735  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1736 
1737  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
1738  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1739  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1740  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1741 
1742  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
1743  // is still a rotation
1744 
1745  Stmp5.f = -2.f;
1746  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1747  Stmp4.f = 1.f;
1748  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1749 
1750  Sa11.f = Sa11.f * Stmp4.f;
1751  Sa21.f = Sa21.f * Stmp4.f;
1752  Sa31.f = Sa31.f * Stmp4.f;
1753 
1754  Sv11.f = Sv11.f * Stmp4.f;
1755  Sv21.f = Sv21.f * Stmp4.f;
1756  Sv31.f = Sv31.f * Stmp4.f;
1757 
1758  // Swap columns 2-3 if necessary
1759 
1760  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
1761  Stmp5.ui = Sa12.ui ^ Sa13.ui;
1762  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1763  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1764  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1765 
1766  Stmp5.ui = Sa22.ui ^ Sa23.ui;
1767  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1768  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1769  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1770 
1771  Stmp5.ui = Sa32.ui ^ Sa33.ui;
1772  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1773  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1774  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1775 
1776  Stmp5.ui = Sv12.ui ^ Sv13.ui;
1777  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1778  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1779  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1780 
1781  Stmp5.ui = Sv22.ui ^ Sv23.ui;
1782  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1783  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1784  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1785 
1786  Stmp5.ui = Sv32.ui ^ Sv33.ui;
1787  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1788  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1789  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1790 
1791  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
1792  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1793  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1794  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1795 
1796  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
1797  // is still a rotation
1798 
1799  Stmp5.f = -2.f;
1800  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1801  Stmp4.f = 1.f;
1802  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1803 
1804  Sa13.f = Sa13.f * Stmp4.f;
1805  Sa23.f = Sa23.f * Stmp4.f;
1806  Sa33.f = Sa33.f * Stmp4.f;
1807 
1808  Sv13.f = Sv13.f * Stmp4.f;
1809  Sv23.f = Sv23.f * Stmp4.f;
1810  Sv33.f = Sv33.f * Stmp4.f;
1811 
1812  // ###########################################################
1813  // Construct QR factorization of A*V (=U*D) using Givens rotations
1814  // ###########################################################
1815 
1816  Su11.f = 1.f;
1817  Su12.f = 0.f;
1818  Su13.f = 0.f;
1819  Su21.f = 0.f;
1820  Su22.f = 1.f;
1821  Su23.f = 0.f;
1822  Su31.f = 0.f;
1823  Su32.f = 0.f;
1824  Su33.f = 1.f;
1825 
1826  Ssh.f = Sa21.f * Sa21.f;
1827  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1828  Ssh.ui = Ssh.ui & Sa21.ui;
1829 
1830  Stmp5.f = 0.f;
1831  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1832  Sch.f = max(Sch.f, Sa11.f);
1833  Sch.f = max(Sch.f, gsmall_number);
1834  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1835 
1836  Stmp1.f = Sch.f * Sch.f;
1837  Stmp2.f = Ssh.f * Ssh.f;
1838  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1839  Stmp1.f = __frsqrt_rn(Stmp2.f);
1840 
1841  Stmp4.f = Stmp1.f * 0.5f;
1842  Stmp3.f = Stmp1.f * Stmp4.f;
1843  Stmp3.f = Stmp1.f * Stmp3.f;
1844  Stmp3.f = Stmp2.f * Stmp3.f;
1845  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1846  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1847  Stmp1.f = Stmp1.f * Stmp2.f;
1848 
1849  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1850 
1851  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1852  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1853  Sch.ui = Stmp5.ui & Sch.ui;
1854  Ssh.ui = Stmp5.ui & Ssh.ui;
1855  Sch.ui = Sch.ui | Stmp1.ui;
1856  Ssh.ui = Ssh.ui | Stmp2.ui;
1857 
1858  Stmp1.f = Sch.f * Sch.f;
1859  Stmp2.f = Ssh.f * Ssh.f;
1860  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1861  Stmp1.f = __frsqrt_rn(Stmp2.f);
1862 
1863  Stmp4.f = Stmp1.f * 0.5f;
1864  Stmp3.f = Stmp1.f * Stmp4.f;
1865  Stmp3.f = Stmp1.f * Stmp3.f;
1866  Stmp3.f = Stmp2.f * Stmp3.f;
1867  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1868  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1869 
1870  Sch.f = Sch.f * Stmp1.f;
1871  Ssh.f = Ssh.f * Stmp1.f;
1872 
1873  Sc.f = Sch.f * Sch.f;
1874  Ss.f = Ssh.f * Ssh.f;
1875  Sc.f = __fsub_rn(Sc.f, Ss.f);
1876  Ss.f = Ssh.f * Sch.f;
1877  Ss.f = __fadd_rn(Ss.f, Ss.f);
1878 
1879  // ###########################################################
1880  // Rotate matrix A
1881  // ###########################################################
1882 
1883  Stmp1.f = Ss.f * Sa11.f;
1884  Stmp2.f = Ss.f * Sa21.f;
1885  Sa11.f = Sc.f * Sa11.f;
1886  Sa21.f = Sc.f * Sa21.f;
1887  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1888  Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);
1889 
1890  Stmp1.f = Ss.f * Sa12.f;
1891  Stmp2.f = Ss.f * Sa22.f;
1892  Sa12.f = Sc.f * Sa12.f;
1893  Sa22.f = Sc.f * Sa22.f;
1894  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
1895  Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);
1896 
1897  Stmp1.f = Ss.f * Sa13.f;
1898  Stmp2.f = Ss.f * Sa23.f;
1899  Sa13.f = Sc.f * Sa13.f;
1900  Sa23.f = Sc.f * Sa23.f;
1901  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
1902  Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);
1903 
1904  // ###########################################################
1905  // Update matrix U
1906  // ###########################################################
1907 
1908  Stmp1.f = Ss.f * Su11.f;
1909  Stmp2.f = Ss.f * Su12.f;
1910  Su11.f = Sc.f * Su11.f;
1911  Su12.f = Sc.f * Su12.f;
1912  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
1913  Su12.f = __fsub_rn(Su12.f, Stmp1.f);
1914 
1915  Stmp1.f = Ss.f * Su21.f;
1916  Stmp2.f = Ss.f * Su22.f;
1917  Su21.f = Sc.f * Su21.f;
1918  Su22.f = Sc.f * Su22.f;
1919  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
1920  Su22.f = __fsub_rn(Su22.f, Stmp1.f);
1921 
1922  Stmp1.f = Ss.f * Su31.f;
1923  Stmp2.f = Ss.f * Su32.f;
1924  Su31.f = Sc.f * Su31.f;
1925  Su32.f = Sc.f * Su32.f;
1926  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
1927  Su32.f = __fsub_rn(Su32.f, Stmp1.f);
1928 
1929  // Second Givens rotation
1930 
1931  Ssh.f = Sa31.f * Sa31.f;
1932  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1933  Ssh.ui = Ssh.ui & Sa31.ui;
1934 
1935  Stmp5.f = 0.f;
1936  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1937  Sch.f = max(Sch.f, Sa11.f);
1938  Sch.f = max(Sch.f, gsmall_number);
1939  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1940 
1941  Stmp1.f = Sch.f * Sch.f;
1942  Stmp2.f = Ssh.f * Ssh.f;
1943  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1944  Stmp1.f = __frsqrt_rn(Stmp2.f);
1945 
1946  Stmp4.f = Stmp1.f * 0.5;
1947  Stmp3.f = Stmp1.f * Stmp4.f;
1948  Stmp3.f = Stmp1.f * Stmp3.f;
1949  Stmp3.f = Stmp2.f * Stmp3.f;
1950  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1951  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1952  Stmp1.f = Stmp1.f * Stmp2.f;
1953 
1954  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1955 
1956  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1957  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1958  Sch.ui = Stmp5.ui & Sch.ui;
1959  Ssh.ui = Stmp5.ui & Ssh.ui;
1960  Sch.ui = Sch.ui | Stmp1.ui;
1961  Ssh.ui = Ssh.ui | Stmp2.ui;
1962 
1963  Stmp1.f = Sch.f * Sch.f;
1964  Stmp2.f = Ssh.f * Ssh.f;
1965  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1966  Stmp1.f = __frsqrt_rn(Stmp2.f);
1967 
1968  Stmp4.f = Stmp1.f * 0.5f;
1969  Stmp3.f = Stmp1.f * Stmp4.f;
1970  Stmp3.f = Stmp1.f * Stmp3.f;
1971  Stmp3.f = Stmp2.f * Stmp3.f;
1972  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1973  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1974 
1975  Sch.f = Sch.f * Stmp1.f;
1976  Ssh.f = Ssh.f * Stmp1.f;
1977 
1978  Sc.f = Sch.f * Sch.f;
1979  Ss.f = Ssh.f * Ssh.f;
1980  Sc.f = __fsub_rn(Sc.f, Ss.f);
1981  Ss.f = Ssh.f * Sch.f;
1982  Ss.f = __fadd_rn(Ss.f, Ss.f);
1983 
1984  // ###########################################################
1985  // Rotate matrix A
1986  // ###########################################################
1987 
1988  Stmp1.f = Ss.f * Sa11.f;
1989  Stmp2.f = Ss.f * Sa31.f;
1990  Sa11.f = Sc.f * Sa11.f;
1991  Sa31.f = Sc.f * Sa31.f;
1992  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1993  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
1994 
1995  Stmp1.f = Ss.f * Sa12.f;
1996  Stmp2.f = Ss.f * Sa32.f;
1997  Sa12.f = Sc.f * Sa12.f;
1998  Sa32.f = Sc.f * Sa32.f;
1999  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
2000  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2001 
2002  Stmp1.f = Ss.f * Sa13.f;
2003  Stmp2.f = Ss.f * Sa33.f;
2004  Sa13.f = Sc.f * Sa13.f;
2005  Sa33.f = Sc.f * Sa33.f;
2006  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
2007  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2008 
2009  // ###########################################################
2010  // Update matrix U
2011  // ###########################################################
2012 
2013  Stmp1.f = Ss.f * Su11.f;
2014  Stmp2.f = Ss.f * Su13.f;
2015  Su11.f = Sc.f * Su11.f;
2016  Su13.f = Sc.f * Su13.f;
2017  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
2018  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2019 
2020  Stmp1.f = Ss.f * Su21.f;
2021  Stmp2.f = Ss.f * Su23.f;
2022  Su21.f = Sc.f * Su21.f;
2023  Su23.f = Sc.f * Su23.f;
2024  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
2025  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2026 
2027  Stmp1.f = Ss.f * Su31.f;
2028  Stmp2.f = Ss.f * Su33.f;
2029  Su31.f = Sc.f * Su31.f;
2030  Su33.f = Sc.f * Su33.f;
2031  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
2032  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2033 
2034  // Third Givens Rotation
2035 
2036  Ssh.f = Sa32.f * Sa32.f;
2037  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
2038  Ssh.ui = Ssh.ui & Sa32.ui;
2039 
2040  Stmp5.f = 0.f;
2041  Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
2042  Sch.f = max(Sch.f, Sa22.f);
2043  Sch.f = max(Sch.f, gsmall_number);
2044  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
2045 
2046  Stmp1.f = Sch.f * Sch.f;
2047  Stmp2.f = Ssh.f * Ssh.f;
2048  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2049  Stmp1.f = __frsqrt_rn(Stmp2.f);
2050 
2051  Stmp4.f = Stmp1.f * 0.5f;
2052  Stmp3.f = Stmp1.f * Stmp4.f;
2053  Stmp3.f = Stmp1.f * Stmp3.f;
2054  Stmp3.f = Stmp2.f * Stmp3.f;
2055  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2056  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2057  Stmp1.f = Stmp1.f * Stmp2.f;
2058 
2059  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
2060 
2061  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
2062  Stmp2.ui = ~Stmp5.ui & Sch.ui;
2063  Sch.ui = Stmp5.ui & Sch.ui;
2064  Ssh.ui = Stmp5.ui & Ssh.ui;
2065  Sch.ui = Sch.ui | Stmp1.ui;
2066  Ssh.ui = Ssh.ui | Stmp2.ui;
2067 
2068  Stmp1.f = Sch.f * Sch.f;
2069  Stmp2.f = Ssh.f * Ssh.f;
2070  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2071  Stmp1.f = __frsqrt_rn(Stmp2.f);
2072 
2073  Stmp4.f = Stmp1.f * 0.5f;
2074  Stmp3.f = Stmp1.f * Stmp4.f;
2075  Stmp3.f = Stmp1.f * Stmp3.f;
2076  Stmp3.f = Stmp2.f * Stmp3.f;
2077  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2078  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2079 
2080  Sch.f = Sch.f * Stmp1.f;
2081  Ssh.f = Ssh.f * Stmp1.f;
2082 
2083  Sc.f = Sch.f * Sch.f;
2084  Ss.f = Ssh.f * Ssh.f;
2085  Sc.f = __fsub_rn(Sc.f, Ss.f);
2086  Ss.f = Ssh.f * Sch.f;
2087  Ss.f = __fadd_rn(Ss.f, Ss.f);
2088 
2089  // ###########################################################
2090  // Rotate matrix A
2091  // ###########################################################
2092 
2093  Stmp1.f = Ss.f * Sa21.f;
2094  Stmp2.f = Ss.f * Sa31.f;
2095  Sa21.f = Sc.f * Sa21.f;
2096  Sa31.f = Sc.f * Sa31.f;
2097  Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
2098  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2099 
2100  Stmp1.f = Ss.f * Sa22.f;
2101  Stmp2.f = Ss.f * Sa32.f;
2102  Sa22.f = Sc.f * Sa22.f;
2103  Sa32.f = Sc.f * Sa32.f;
2104  Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
2105  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2106 
2107  Stmp1.f = Ss.f * Sa23.f;
2108  Stmp2.f = Ss.f * Sa33.f;
2109  Sa23.f = Sc.f * Sa23.f;
2110  Sa33.f = Sc.f * Sa33.f;
2111  Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
2112  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2113 
2114  // ###########################################################
2115  // Update matrix U
2116  // ###########################################################
2117 
2118  Stmp1.f = Ss.f * Su12.f;
2119  Stmp2.f = Ss.f * Su13.f;
2120  Su12.f = Sc.f * Su12.f;
2121  Su13.f = Sc.f * Su13.f;
2122  Su12.f = __fadd_rn(Su12.f, Stmp2.f);
2123  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2124 
2125  Stmp1.f = Ss.f * Su22.f;
2126  Stmp2.f = Ss.f * Su23.f;
2127  Su22.f = Sc.f * Su22.f;
2128  Su23.f = Sc.f * Su23.f;
2129  Su22.f = __fadd_rn(Su22.f, Stmp2.f);
2130  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2131 
2132  Stmp1.f = Ss.f * Su32.f;
2133  Stmp2.f = Ss.f * Su33.f;
2134  Su32.f = Sc.f * Su32.f;
2135  Su33.f = Sc.f * Su33.f;
2136  Su32.f = __fadd_rn(Su32.f, Stmp2.f);
2137  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2138 
2139  V_3x3[0] = Sv11.f;
2140  V_3x3[1] = Sv12.f;
2141  V_3x3[2] = Sv13.f;
2142  V_3x3[3] = Sv21.f;
2143  V_3x3[4] = Sv22.f;
2144  V_3x3[5] = Sv23.f;
2145  V_3x3[6] = Sv31.f;
2146  V_3x3[7] = Sv32.f;
2147  V_3x3[8] = Sv33.f;
2148 
2149  U_3x3[0] = Su11.f;
2150  U_3x3[1] = Su12.f;
2151  U_3x3[2] = Su13.f;
2152  U_3x3[3] = Su21.f;
2153  U_3x3[4] = Su22.f;
2154  U_3x3[5] = Su23.f;
2155  U_3x3[6] = Su31.f;
2156  U_3x3[7] = Su32.f;
2157  U_3x3[8] = Su33.f;
2158 
2159  S_3x1[0] = Sa11.f;
2160  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
2161  S_3x1[1] = Sa22.f;
2162  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
2163  S_3x1[2] = Sa33.f;
2164 }
2165 
template <typename scalar_t>
static CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void solve_svd3x3(
        const scalar_t *A_3x3,  // input A {3,3}, row-major
        const scalar_t *B_3x1,  // input b {3,1}
        scalar_t *X_3x1)        // output x {3,1}
{
    // Solve A * x = b via the SVD-based pseudo-inverse:
    //   A = U * Sigma * V^T   =>   A^+ = V * Sigma^+ * U^T,   x = A^+ * b.
    // Robust to (near-)singular A: singular values with magnitude below
    // epsilon are treated as zero, so their reciprocal is dropped from
    // Sigma^+ (Moore-Penrose convention) instead of blowing up.
    scalar_t U[9];
    scalar_t V[9];
    scalar_t S[3];
    svd3x3(A_3x3, U, S, V);

    // ###########################################################
    // Sigma^+ : invert significant singular values, zero the rest
    // ###########################################################
    // scalar_t(1)/S[i] keeps the division in the template's precision;
    // a bare `1.0 / S[i]` would promote to double even when scalar_t is
    // float, which is needlessly slow in device code.
    const scalar_t epsilon = static_cast<scalar_t>(1e-10);
    S[0] = abs(S[0]) < epsilon ? scalar_t(0) : scalar_t(1) / S[0];
    S[1] = abs(S[1]) < epsilon ? scalar_t(0) : scalar_t(1) / S[1];
    S[2] = abs(S[2]) < epsilon ? scalar_t(0) : scalar_t(1) / S[2];

    // ###########################################################
    // (Sigma^+) * UT : row i of the product is S[i] * (column i of U)
    // ###########################################################
    scalar_t S_UT[9];

    S_UT[0] = U[0] * S[0];
    S_UT[1] = U[3] * S[0];
    S_UT[2] = U[6] * S[0];
    S_UT[3] = U[1] * S[1];
    S_UT[4] = U[4] * S[1];
    S_UT[5] = U[7] * S[1];
    S_UT[6] = U[2] * S[2];
    S_UT[7] = U[5] * S[2];
    S_UT[8] = U[8] * S[2];

    // ###########################################################
    // Ainv = V * [(Sigma^+) * UT]
    // ###########################################################
    scalar_t Ainv[9] = {0};
    matmul3x3_3x3(V, S_UT, Ainv);

    // ###########################################################
    // x = Ainv * b
    // ###########################################################
    matmul3x3_3x1(Ainv, B_3x1, X_3x1);
}
2212 
2213 } // namespace kernel
2214 } // namespace linalg
2215 } // namespace core
2216 } // namespace cloudViewer
Common CUDA utilities.
#define CLOUDVIEWER_FORCE_INLINE
Definition: CUDAUtils.h:43
#define CLOUDVIEWER_DEVICE
Definition: CUDAUtils.h:45
#define __fsub_rn(x, y)
Definition: SVD3x3.h:63
#define __fadd_rn(x, y)
Definition: SVD3x3.h:62
#define __drsqrt_rn(x)
Definition: SVD3x3.h:68
#define gone
Definition: SVD3x3.h:39
#define __frsqrt_rn(x)
Definition: SVD3x3.h:64
#define gcosine_pi_over_eight
Definition: SVD3x3.h:42
#define __dadd_rn(x, y)
Definition: SVD3x3.h:66
#define gsine_pi_over_eight
Definition: SVD3x3.h:40
#define gfour_gamma_squared
Definition: SVD3x3.h:44
#define __dsub_rn(x, y)
Definition: SVD3x3.h:67
#define gtiny_number
Definition: SVD3x3.h:43
__host__ __device__ int2 abs(int2 v)
Definition: cutil_math.h:1267
int max(int a, int b)
Definition: cutil_math.h:48
static CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void matmul3x3_3x1(const scalar_t &m00, const scalar_t &m01, const scalar_t &m02, const scalar_t &m10, const scalar_t &m11, const scalar_t &m12, const scalar_t &m20, const scalar_t &m21, const scalar_t &m22, const scalar_t &v0, const scalar_t &v1, const scalar_t &v2, scalar_t &o0, scalar_t &o1, scalar_t &o2)
Definition: Matrix.h:18
CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void svd3x3< float >(const float *A_3x3, float *U_3x3, float *S_3x1, float *V_3x3)
Definition: SVD3x3.h:1130
CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void svd3x3< double >(const double *A_3x3, double *U_3x3, double *S_3x1, double *V_3x3)
Definition: SVD3x3.h:93
CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void solve_svd3x3(const scalar_t *A_3x3, const scalar_t *B_3x1, scalar_t *X_3x1)
Definition: SVD3x3.h:2167
CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void svd3x3(const scalar_t *A_3x3, scalar_t *U_3x3, scalar_t *S_3x1, scalar_t *V_3x3)
CLOUDVIEWER_DEVICE CLOUDVIEWER_FORCE_INLINE void matmul3x3_3x3(const scalar_t *A_3x3, const scalar_t *B_3x3, scalar_t *C_3x3)
Definition: Matrix.h:48
Generic file read and write utility for python interface.