@@ -2029,6 +2029,10 @@ class vfloat4 {
2029
2029
void load (const half *values);
2030
2030
#endif /* _HALF_H_ or _IMATH_H_ */
2031
2031
2032
+ /// Load elements 0 and 1 from lo[0..1] and elements 2 and 3 from
2033
+ /// hi[0..1].
2034
+ void load_pairs (const float * lo, const float * hi);
2035
+
2032
2036
void store (float *values) const ;
2033
2037
2034
2038
// / Store the first n values into memory
@@ -2125,6 +2129,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2125
2129
// / shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2126
2130
template <int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2127
2131
2132
+ /// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the four 2-bit
2133
+ /// indices unpacked from the template parameter i (i0 is bits 0-1, i1 is
2134
+ /// bits 2-3, i2 is bits 4-5, and i3 is bits 6-7).
2135
+ template <int i> OIIO_FORCEINLINE vfloat4
2136
+ shuffle (const vfloat4& a, const vfloat4& b);
2137
+
2128
2138
// / Helper: as rapid as possible extraction of one component, when the
2129
2139
// / index is fixed.
2130
2140
template <int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
@@ -6897,6 +6907,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
6897
6907
}
6898
6908
#endif /* _HALF_H_ or _IMATH_H_ */
6899
6909
6910
+ OIIO_FORCEINLINE void
6911
+ vfloat4::load_pairs (const float * lo, const float * hi)
6912
+ {
6913
+ #if OIIO_SIMD_SSE
6914
+ m_simd = _mm_loadh_pi (_mm_loadl_pi (Zero (), (__m64*)lo), (__m64*)hi);
6915
+ #else
6916
+ m_val[0 ] = lo[0 ];
6917
+ m_val[1 ] = lo[1 ];
6918
+ m_val[2 ] = hi[0 ];
6919
+ m_val[3 ] = hi[1 ];
6920
+ #endif
6921
+ }
6922
+
6900
6923
OIIO_FORCEINLINE void vfloat4::store (float *values) const {
6901
6924
#if OIIO_SIMD_SSE
6902
6925
// Use an unaligned store -- it's just as fast when the memory turns
@@ -7338,6 +7361,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
7338
7361
#endif
7339
7362
7340
7363
7364
+ template <int i>
7365
+ OIIO_FORCEINLINE vfloat4
7366
+ shuffle (const vfloat4& a, const vfloat4& b)
7367
+ {
7368
+ #if OIIO_SIMD_SSE
7369
+ return vfloat4 (_mm_shuffle_ps (a, b, i));
7370
+ #else
7371
+ return vfloat4 (a[i & 0x03 ], a[(i >> 2 ) & (0x03 )],
7372
+ b[(i >> 4 ) & 0x03 ], b[(i >> 6 ) & (0x03 )]);
7373
+ #endif
7374
+ }
7375
+
7341
7376
7342
7377
// / Helper: as rapid as possible extraction of one component, when the
7343
7378
// / index is fixed.
@@ -8464,23 +8499,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
8464
8499
}
8465
8500
8466
8501
8467
- #if OIIO_SIMD_SSE
8468
- OIIO_FORCEINLINE matrix44 matrix44::inverse () const {
8502
+
8503
+ inline matrix44 matrix44::inverse () const
8504
+ {
8469
8505
// Adapted from this code from Intel:
8470
8506
// ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
8471
8507
vfloat4 minor0, minor1, minor2, minor3;
8472
- vfloat4 row0, row1, row2, row3;
8473
8508
vfloat4 det, tmp1;
8474
- const float *src = (const float *)this ;
8475
- vfloat4 zero = vfloat4::Zero ();
8476
- tmp1 = vfloat4 (_mm_loadh_pi (_mm_loadl_pi (zero, (__m64*)(src)), (__m64*)(src+ 4 )));
8477
- row1 = vfloat4 (_mm_loadh_pi (_mm_loadl_pi (zero, (__m64*)(src+8 )), (__m64*)(src+12 )));
8478
- row0 = vfloat4 (_mm_shuffle_ps (tmp1, row1, 0x88 ));
8479
- row1 = vfloat4 (_mm_shuffle_ps (row1, tmp1, 0xDD ));
8480
- tmp1 = vfloat4 (_mm_loadh_pi (_mm_loadl_pi (tmp1, (__m64*)(src+ 2 )), (__m64*)(src+ 6 )));
8481
- row3 = vfloat4 (_mm_loadh_pi (_mm_loadl_pi (zero, (__m64*)(src+10 )), (__m64*)(src+14 )));
8482
- row2 = vfloat4 (_mm_shuffle_ps (tmp1, row3, 0x88 ));
8483
- row3 = vfloat4 (_mm_shuffle_ps (row3, tmp1, 0xDD ));
8509
+ #if 0
8510
+ // Original code looked like this:
8511
+ vfloat4 row0, row1, row2, row3;
8512
+ const float *src = (const float *)&msrc;
8513
+ tmp1.load_pairs(src, src+ 4);
8514
+ row1.load_pairs(src+8, src+12);
8515
+ row0 = shuffle<0x88>(tmp1, row1);
8516
+ row1 = shuffle<0xDD>(row1, tmp1);
8517
+ tmp1.load_pairs(src+ 2, src+ 6);
8518
+ row3.load_pairs(src+10, src+14);
8519
+ row2 = shuffle<0x88>(tmp1, row3);
8520
+ row3 = shuffle<0xDD>(row3, tmp1);
8521
+ #else
8522
+ // But this is simpler and easier to understand:
8523
+ matrix44 Mt = this ->transposed ();
8524
+ vfloat4 row0 = Mt[0 ];
8525
+ vfloat4 row1 = shuffle<2 ,3 ,0 ,1 >(Mt[1 ]);
8526
+ vfloat4 row2 = Mt[2 ];
8527
+ vfloat4 row3 = shuffle<2 ,3 ,0 ,1 >(Mt[3 ]);
8528
+ #endif
8529
+ // At this point, the row variables should contain the following indices
8530
+ // of the original input matrix:
8531
+ // row0 = 0 4 8 12
8532
+ // row1 = 9 13 1 5
8533
+ // row2 = 2 6 10 14
8534
+ // row3 = 11 15 3 7
8535
+
8484
8536
// -----------------------------------------------
8485
8537
tmp1 = row2 * row3;
8486
8538
tmp1 = shuffle<1 ,0 ,3 ,2 >(tmp1);
@@ -8535,20 +8587,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8535
8587
minor3 = (row1 * tmp1) + minor3;
8536
8588
// -----------------------------------------------
8537
8589
det = row0 * minor0;
8538
- det = shuffle<2 ,3 ,0 ,1 >(det) + det;
8539
- det = vfloat4 (_mm_add_ss (shuffle<1 ,0 ,3 ,2 >(det), det));
8540
- tmp1 = vfloat4 (_mm_rcp_ss (det));
8541
- det = vfloat4 (_mm_sub_ss (_mm_add_ss (tmp1, tmp1), _mm_mul_ss (det, _mm_mul_ss (tmp1, tmp1))));
8542
- det = shuffle<0 >(det);
8590
+ float det0 = reduce_add (det);
8591
+ float tmp1_0 = 1 .0f / det0;
8592
+ det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
8593
+ det = vfloat4 (det0);
8543
8594
return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8544
8595
}
8545
- #elif defined(INCLUDED_IMATHMATRIX_H)
8546
- OIIO_FORCEINLINE matrix44 matrix44::inverse () const {
8547
- return matrix44 (((Imath::M44f*)this )->inverse ());
8548
- }
8549
- #else
8550
- #error "Don't know how to compute matrix44::inverse()"
8551
- #endif
8596
+
8552
8597
8553
8598
8554
8599
inline std::ostream& operator << (std::ostream& cout, const matrix44 &M) {
0 commit comments