Skip to content

Commit a885ca3

Browse files
authored
fix(simd.h): Make all-architecture matrix44::inverse() (#4076)
We had an awkward situation of the simd matrix44::inverse() only being defined for Intel SSE, and in all other cases falling back on Imath::M44f::inverse. The problem with that is that it made simd.h depend either directly on ImathMatrix.h, or assume that it would be included prior to inclusion of simd.h. This was not good. This patch rewrites inverse() to eliminate all of the direct SSE intrinsics in terms of other simd wrapper functions, which means that they will be defined one way or another on all architectures. Along the way I added a new vfloat4 shuffle that combines two vectors (corresponding to `_mm_shuffle_ps` in SSE land), and a vfloat4 constructor from 2 float*', where the first two elements come from the first ptr and the second two elements come from the second ptr. Ultimately, the final implementation I settled on for the new inverse didn't use these (an earlier version did), but I want to keep them around anyway in case they are useful in the future. I did add a test for them. Signed-off-by: Larry Gritz <[email protected]>
1 parent 0b7c289 commit a885ca3

File tree

2 files changed

+107
-32
lines changed

2 files changed

+107
-32
lines changed

src/include/OpenImageIO/simd.h

Lines changed: 70 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -275,12 +275,6 @@
275275
#endif
276276

277277

278-
// Without SSE, we need to fall back on Imath for matrix44 invert
279-
#if !OIIO_SIMD_SSE && !defined(__CUDA_ARCH__)
280-
# include <OpenImageIO/Imath.h>
281-
#endif
282-
283-
284278
OIIO_NAMESPACE_BEGIN
285279

286280
namespace simd {
@@ -2029,6 +2023,10 @@ class vfloat4 {
20292023
void load (const half *values);
20302024
#endif /* _HALF_H_ or _IMATH_H_ */
20312025

2026+
/// Load the first 2 elements from lo[0..1] and the second two elements
2027+
/// from hi[0..1].
2028+
void load_pairs(const float* lo, const float* hi);
2029+
20322030
void store (float *values) const;
20332031

20342032
/// Store the first n values into memory
@@ -2125,6 +2123,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
21252123
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
21262124
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
21272125

2126+
/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
2127+
/// 2-bit indices packed into the template parameter i (going from the low
2128+
/// 2-bit pair to the high 2-bit pair).
2129+
template<int i> OIIO_FORCEINLINE vfloat4
2130+
shuffle(const vfloat4& a, const vfloat4& b);
2131+
21282132
/// Helper: as rapid as possible extraction of one component, when the
21292133
/// index is fixed.
21302134
template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
@@ -6897,6 +6901,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
68976901
}
68986902
#endif /* _HALF_H_ or _IMATH_H_ */
68996903

6904+
OIIO_FORCEINLINE void
6905+
vfloat4::load_pairs(const float* lo, const float* hi)
6906+
{
6907+
#if OIIO_SIMD_SSE
6908+
m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi);
6909+
#else
6910+
m_val[0] = lo[0];
6911+
m_val[1] = lo[1];
6912+
m_val[2] = hi[0];
6913+
m_val[3] = hi[1];
6914+
#endif
6915+
}
6916+
69006917
OIIO_FORCEINLINE void vfloat4::store (float *values) const {
69016918
#if OIIO_SIMD_SSE
69026919
// Use an unaligned store -- it's just as fast when the memory turns
@@ -7338,6 +7355,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
73387355
#endif
73397356

73407357

7358+
template<int i>
7359+
OIIO_FORCEINLINE vfloat4
7360+
shuffle(const vfloat4& a, const vfloat4& b)
7361+
{
7362+
#if OIIO_SIMD_SSE
7363+
return vfloat4(_mm_shuffle_ps(a, b, i));
7364+
#else
7365+
return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
7366+
b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
7367+
#endif
7368+
}
7369+
73417370

73427371
/// Helper: as rapid as possible extraction of one component, when the
73437372
/// index is fixed.
@@ -8464,23 +8493,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
84648493
}
84658494

84668495

8467-
#if OIIO_SIMD_SSE
8468-
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8496+
8497+
inline matrix44 matrix44::inverse() const
8498+
{
84698499
// Adapted from this code from Intel:
84708500
// ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
84718501
vfloat4 minor0, minor1, minor2, minor3;
8472-
vfloat4 row0, row1, row2, row3;
84738502
vfloat4 det, tmp1;
8474-
const float *src = (const float *)this;
8475-
vfloat4 zero = vfloat4::Zero();
8476-
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
8477-
row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
8478-
row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
8479-
row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
8480-
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
8481-
row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
8482-
row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
8483-
row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
8503+
#if 0
8504+
// Original code looked like this:
8505+
vfloat4 row0, row1, row2, row3;
8506+
const float *src = (const float *)&msrc;
8507+
tmp1.load_pairs(src, src+ 4);
8508+
row1.load_pairs(src+8, src+12);
8509+
row0 = shuffle<0x88>(tmp1, row1);
8510+
row1 = shuffle<0xDD>(row1, tmp1);
8511+
tmp1.load_pairs(src+ 2, src+ 6);
8512+
row3.load_pairs(src+10, src+14);
8513+
row2 = shuffle<0x88>(tmp1, row3);
8514+
row3 = shuffle<0xDD>(row3, tmp1);
8515+
#else
8516+
// But this is simpler and easier to understand:
8517+
matrix44 Mt = this->transposed();
8518+
vfloat4 row0 = Mt[0];
8519+
vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
8520+
vfloat4 row2 = Mt[2];
8521+
vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
8522+
#endif
8523+
// At this point, the row variables should contain the following indices
8524+
// of the original input matrix:
8525+
// row0 = 0 4 8 12
8526+
// row1 = 9 13 1 5
8527+
// row2 = 2 6 10 14
8528+
// row3 = 11 15 3 7
8529+
84848530
// -----------------------------------------------
84858531
tmp1 = row2 * row3;
84868532
tmp1 = shuffle<1,0,3,2>(tmp1);
@@ -8535,20 +8581,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
85358581
minor3 = (row1 * tmp1) + minor3;
85368582
// -----------------------------------------------
85378583
det = row0 * minor0;
8538-
det = shuffle<2,3,0,1>(det) + det;
8539-
det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
8540-
tmp1 = vfloat4(_mm_rcp_ss(det));
8541-
det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
8542-
det = shuffle<0>(det);
8584+
float det0 = reduce_add(det);
8585+
float tmp1_0 = 1.0f / det0;
8586+
det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
8587+
det = vfloat4(det0);
85438588
return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
85448589
}
8545-
#elif defined(INCLUDED_IMATHMATRIX_H)
8546-
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8547-
return matrix44 (((Imath::M44f*)this)->inverse());
8548-
}
8549-
#else
8550-
#error "Don't know how to compute matrix44::inverse()"
8551-
#endif
8590+
85528591

85538592

85548593
inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {

src/libutil/simd_test.cpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,22 @@ test_special()
15591559
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)/vfloat4(127.0), vfloat4(1.0f));
15601560
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)*vfloat4(1.0f/127.0), vfloat4(1.0f));
15611561
}
1562+
1563+
// Test the 2-vfloat4 shuffle
1564+
{
1565+
#define PERMUTE(a,b,c,d) ((d<<6)|(c<<4)|(b<<2)|(a<<0))
1566+
vfloat4 a(10, 11, 12, 13);
1567+
vfloat4 b(20, 21, 22, 23);
1568+
OIIO_CHECK_SIMD_EQUAL(shuffle<PERMUTE(2,0,1,3)>(a,b),
1569+
vfloat4(12, 10, 21, 23));
1570+
}
1571+
// Test vfloat4::load_pairs
1572+
{
1573+
vfloat4 x;
1574+
static const float vals[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
1575+
x.load_pairs(vals+2, vals+5);
1576+
OIIO_CHECK_SIMD_EQUAL(x, vfloat4(2, 3, 5, 6));
1577+
}
15621578
}
15631579

15641580

@@ -1825,7 +1841,7 @@ test_matrix()
18251841
{
18261842
Imath::V3f P(1.0f, 0.0f, 0.0f);
18271843
Imath::M44f Mtrans(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
1828-
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI_2, 0.0f));
1844+
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI/4.0f, 0.0f));
18291845

18301846
test_heading("Testing matrix ops:");
18311847
std::cout << " P = " << P << "\n";
@@ -1878,6 +1894,26 @@ test_matrix()
18781894
OIIO_CHECK_ASSERT(
18791895
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
18801896
1.0e-6f));
1897+
1898+
// Test that matrix44::inverse always matches Imath::M44f::inverse
1899+
Imath::M44f rts = (Mtrans * Mrot) * Imath::M44f(2.0f, 0.0f, 0.0f, 0.0f,
1900+
0.0f, 1.0f, 0.0f, 0.0f,
1901+
0.0f, 0.0f, 1.0f, 0.0f,
1902+
0.0f, 0.0f, 0.0f, 1.0f);
1903+
OIIO_CHECK_ASSERT(
1904+
mx_equal_thresh(matrix44(rts.inverse()), matrix44(rts).inverse(),
1905+
1.0e-5f));
1906+
OIIO_CHECK_ASSERT(
1907+
mx_equal_thresh(matrix44(Mtrans.inverse()), matrix44(Mtrans).inverse(),
1908+
1.0e-6f));
1909+
OIIO_CHECK_ASSERT(
1910+
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
1911+
1.0e-6f));
1912+
Imath::M44f m123(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 1.0f);
1913+
OIIO_CHECK_ASSERT(
1914+
mx_equal_thresh(matrix44(m123.inverse()), matrix44(m123).inverse(),
1915+
1.0e-6f));
1916+
18811917
OIIO_CHECK_EQUAL(
18821918
matrix44(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
18831919
Imath::M44f(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));

0 commit comments

Comments
 (0)