|
275 | 275 | #endif
|
276 | 276 |
|
277 | 277 |
|
278 |
| -// Without SSE, we need to fall back on Imath for matrix44 invert |
279 |
| -#if !OIIO_SIMD_SSE && !defined(__CUDA_ARCH__) |
280 |
| -# include <OpenImageIO/Imath.h> |
281 |
| -#endif |
282 |
| - |
283 |
| - |
284 | 278 | OIIO_NAMESPACE_BEGIN
|
285 | 279 |
|
286 | 280 | namespace simd {
|
@@ -2029,6 +2023,10 @@ class vfloat4 {
|
2029 | 2023 | void load (const half *values);
|
2030 | 2024 | #endif /* _HALF_H_ or _IMATH_H_ */
|
2031 | 2025 |
|
| 2026 | + /// Load elements 0 and 1 from lo[0..1] and elements 2 and 3 from |
| 2027 | + /// hi[0..1]. |
| 2028 | + void load_pairs(const float* lo, const float* hi); |
| 2029 | + |
2032 | 2030 | void store (float *values) const;
|
2033 | 2031 |
|
2034 | 2032 | /// Store the first n values into memory
|
@@ -2125,6 +2123,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
|
2125 | 2123 | /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
|
2126 | 2124 | template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
|
2127 | 2125 |
|
| 2126 | +/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted |
| 2127 | +/// 2-bit indices packed into the template parameter i (going from the low |
| 2128 | +/// 2-bit pair to the high 2-bit pair). |
| 2129 | +template<int i> OIIO_FORCEINLINE vfloat4 |
| 2130 | +shuffle(const vfloat4& a, const vfloat4& b); |
| 2131 | + |
2128 | 2132 | /// Helper: as rapid as possible extraction of one component, when the
|
2129 | 2133 | /// index is fixed.
|
2130 | 2134 | template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
|
@@ -6897,6 +6901,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
|
6897 | 6901 | }
|
6898 | 6902 | #endif /* _HALF_H_ or _IMATH_H_ */
|
6899 | 6903 |
|
| 6904 | +OIIO_FORCEINLINE void |
| 6905 | +vfloat4::load_pairs(const float* lo, const float* hi) |
| 6906 | +{ |
| 6907 | +#if OIIO_SIMD_SSE |
| 6908 | + m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi); |
| 6909 | +#else |
| 6910 | + m_val[0] = lo[0]; |
| 6911 | + m_val[1] = lo[1]; |
| 6912 | + m_val[2] = hi[0]; |
| 6913 | + m_val[3] = hi[1]; |
| 6914 | +#endif |
| 6915 | +} |
| 6916 | + |
6900 | 6917 | OIIO_FORCEINLINE void vfloat4::store (float *values) const {
|
6901 | 6918 | #if OIIO_SIMD_SSE
|
6902 | 6919 | // Use an unaligned store -- it's just as fast when the memory turns
|
@@ -7338,6 +7355,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
|
7338 | 7355 | #endif
|
7339 | 7356 |
|
7340 | 7357 |
|
| 7358 | +template<int i> |
| 7359 | +OIIO_FORCEINLINE vfloat4 |
| 7360 | +shuffle(const vfloat4& a, const vfloat4& b) |
| 7361 | +{ |
| 7362 | +#if OIIO_SIMD_SSE |
| 7363 | + return vfloat4(_mm_shuffle_ps(a, b, i)); |
| 7364 | +#else |
| 7365 | + return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)], |
| 7366 | + b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]); |
| 7367 | +#endif |
| 7368 | +} |
| 7369 | + |
7341 | 7370 |
|
7342 | 7371 | /// Helper: as rapid as possible extraction of one component, when the
|
7343 | 7372 | /// index is fixed.
|
@@ -8464,23 +8493,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
|
8464 | 8493 | }
|
8465 | 8494 |
|
8466 | 8495 |
|
8467 |
| -#if OIIO_SIMD_SSE |
8468 |
| -OIIO_FORCEINLINE matrix44 matrix44::inverse() const { |
| 8496 | + |
| 8497 | +inline matrix44 matrix44::inverse() const |
| 8498 | +{ |
8469 | 8499 | // Adapted from this code from Intel:
|
8470 | 8500 | // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
|
8471 | 8501 | vfloat4 minor0, minor1, minor2, minor3;
|
8472 |
| - vfloat4 row0, row1, row2, row3; |
8473 | 8502 | vfloat4 det, tmp1;
|
8474 |
| - const float *src = (const float *)this; |
8475 |
| - vfloat4 zero = vfloat4::Zero(); |
8476 |
| - tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4))); |
8477 |
| - row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12))); |
8478 |
| - row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88)); |
8479 |
| - row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD)); |
8480 |
| - tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6))); |
8481 |
| - row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14))); |
8482 |
| - row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88)); |
8483 |
| - row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD)); |
| 8503 | +#if 0 |
| 8504 | + // Original code looked like this (kept for reference; 'msrc' is *this): |
| 8505 | + vfloat4 row0, row1, row2, row3; |
| 8506 | + const float *src = (const float *)&msrc; |
| 8507 | + tmp1.load_pairs(src, src+ 4); |
| 8508 | + row1.load_pairs(src+8, src+12); |
| 8509 | + row0 = shuffle<0x88>(tmp1, row1); |
| 8510 | + row1 = shuffle<0xDD>(row1, tmp1); |
| 8511 | + tmp1.load_pairs(src+ 2, src+ 6); |
| 8512 | + row3.load_pairs(src+10, src+14); |
| 8513 | + row2 = shuffle<0x88>(tmp1, row3); |
| 8514 | + row3 = shuffle<0xDD>(row3, tmp1); |
| 8515 | +#else |
| 8516 | + // But this is simpler and easier to understand: |
| 8517 | + matrix44 Mt = this->transposed(); |
| 8518 | + vfloat4 row0 = Mt[0]; |
| 8519 | + vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]); |
| 8520 | + vfloat4 row2 = Mt[2]; |
| 8521 | + vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]); |
| 8522 | +#endif |
| 8523 | + // At this point, the row variables should contain the following indices |
| 8524 | + // of the original input matrix: |
| 8525 | + // row0 = 0 4 8 12 |
| 8526 | + // row1 = 9 13 1 5 |
| 8527 | + // row2 = 2 6 10 14 |
| 8528 | + // row3 = 11 15 3 7 |
| 8529 | + |
8484 | 8530 | // -----------------------------------------------
|
8485 | 8531 | tmp1 = row2 * row3;
|
8486 | 8532 | tmp1 = shuffle<1,0,3,2>(tmp1);
|
@@ -8535,20 +8581,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
|
8535 | 8581 | minor3 = (row1 * tmp1) + minor3;
|
8536 | 8582 | // -----------------------------------------------
|
8537 | 8583 | det = row0 * minor0;
|
8538 |
| - det = shuffle<2,3,0,1>(det) + det; |
8539 |
| - det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det)); |
8540 |
| - tmp1 = vfloat4(_mm_rcp_ss(det)); |
8541 |
| - det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)))); |
8542 |
| - det = shuffle<0>(det); |
| 8584 | + float det0 = reduce_add(det); |
| 8585 | + float tmp1_0 = 1.0f / det0; |
| 8586 | + det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0); |
| 8587 | + det = vfloat4(det0); |
8543 | 8588 | return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
|
8544 | 8589 | }
|
8545 |
| -#elif defined(INCLUDED_IMATHMATRIX_H) |
8546 |
| -OIIO_FORCEINLINE matrix44 matrix44::inverse() const { |
8547 |
| - return matrix44 (((Imath::M44f*)this)->inverse()); |
8548 |
| -} |
8549 |
| -#else |
8550 |
| -#error "Don't know how to compute matrix44::inverse()" |
8551 |
| -#endif |
| 8590 | + |
8552 | 8591 |
|
8553 | 8592 |
|
8554 | 8593 | inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
|
|
0 commit comments