Skip to content

Commit fa0bf3a

Browse files
committed
fix(simd.h): Make all-architecture matrix44::inverse()
We had an awkward situation of the simd matrix44::inverse() only being defined for Intel SSE, and in all other cases falling back on Imath::M44f::inverse. The problem with that is that it made simd.h depend either directly on ImathMatrix.h, or assume that it would be included prior to inclusion of simd.h. This was not good. This patch rewrites inverse() to eliminate all of the direct SSE intrinsics in terms of other simd wrapper functions, which means that they will be defined one way or another on all architectures. Along the way I added a new vfloat4 shuffle that combines two vectors (corresponding to _mm_shuffle_ps in SSE land), and a vfloat4 constructor from 2 float*', where the first two elements come from the first ptr and the second two elements come from the second ptr. Ultimately, the final implementation I settled on for the new inverse didn't use these (an earlier version did), but I want to keep them around anyway in case they are useful in the future. I did add a test for them. Signed-off-by: Larry Gritz <[email protected]>
1 parent fd6a195 commit fa0bf3a

File tree

2 files changed

+107
-26
lines changed

2 files changed

+107
-26
lines changed

src/include/OpenImageIO/simd.h

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,6 +2029,10 @@ class vfloat4 {
20292029
void load (const half *values);
20302030
#endif /* _HALF_H_ or _IMATH_H_ */
20312031

2032+
/// Load the first 2 elements from lo[0..1] and the second two elements
2033+
/// from hi[0..1].
2034+
void load_pairs(const float* lo, const float* hi);
2035+
20322036
void store (float *values) const;
20332037

20342038
/// Store the first n values into memory
@@ -2125,6 +2129,12 @@ OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
21252129
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
21262130
template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
21272131

2132+
/// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
2133+
/// 2-bit indices packed into the template parameter i (going from the low
2134+
/// 2-bit pair to the high 2-bit pair).
2135+
template<int i> OIIO_FORCEINLINE vfloat4
2136+
shuffle(const vfloat4& a, const vfloat4& b);
2137+
21282138
/// Helper: as rapid as possible extraction of one component, when the
21292139
/// index is fixed.
21302140
template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
@@ -6897,6 +6907,19 @@ OIIO_FORCEINLINE void vfloat4::load (const half *values) {
68976907
}
68986908
#endif /* _HALF_H_ or _IMATH_H_ */
68996909

6910+
OIIO_FORCEINLINE void
6911+
vfloat4::load_pairs(const float* lo, const float* hi)
6912+
{
6913+
#if OIIO_SIMD_SSE
6914+
m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi);
6915+
#else
6916+
m_val[0] = lo[0];
6917+
m_val[1] = lo[1];
6918+
m_val[2] = hi[0];
6919+
m_val[3] = hi[1];
6920+
#endif
6921+
}
6922+
69006923
OIIO_FORCEINLINE void vfloat4::store (float *values) const {
69016924
#if OIIO_SIMD_SSE
69026925
// Use an unaligned store -- it's just as fast when the memory turns
@@ -7338,6 +7361,18 @@ template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
73387361
#endif
73397362

73407363

7364+
template<int i>
7365+
OIIO_FORCEINLINE vfloat4
7366+
shuffle(const vfloat4& a, const vfloat4& b)
7367+
{
7368+
#if OIIO_SIMD_SSE
7369+
return vfloat4(_mm_shuffle_ps(a, b, i));
7370+
#else
7371+
return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
7372+
b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
7373+
#endif
7374+
}
7375+
73417376

73427377
/// Helper: as rapid as possible extraction of one component, when the
73437378
/// index is fixed.
@@ -8464,23 +8499,40 @@ OIIO_FORCEINLINE bool operator!= (M44fParam a, const matrix44 &b) {
84648499
}
84658500

84668501

8467-
#if OIIO_SIMD_SSE
8468-
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8502+
8503+
inline matrix44 matrix44::inverse() const
8504+
{
84698505
// Adapted from this code from Intel:
84708506
// ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
84718507
vfloat4 minor0, minor1, minor2, minor3;
8472-
vfloat4 row0, row1, row2, row3;
84738508
vfloat4 det, tmp1;
8474-
const float *src = (const float *)this;
8475-
vfloat4 zero = vfloat4::Zero();
8476-
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
8477-
row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
8478-
row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
8479-
row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
8480-
tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
8481-
row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
8482-
row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
8483-
row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
8509+
#if 0
8510+
// Original code looked like this:
8511+
vfloat4 row0, row1, row2, row3;
8512+
const float *src = (const float *)&msrc;
8513+
tmp1.load_pairs(src, src+ 4);
8514+
row1.load_pairs(src+8, src+12);
8515+
row0 = shuffle<0x88>(tmp1, row1);
8516+
row1 = shuffle<0xDD>(row1, tmp1);
8517+
tmp1.load_pairs(src+ 2, src+ 6);
8518+
row3.load_pairs(src+10, src+14);
8519+
row2 = shuffle<0x88>(tmp1, row3);
8520+
row3 = shuffle<0xDD>(row3, tmp1);
8521+
#else
8522+
// But this is simpler and easier to understand:
8523+
matrix44 Mt = this->transposed();
8524+
vfloat4 row0 = Mt[0];
8525+
vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
8526+
vfloat4 row2 = Mt[2];
8527+
vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
8528+
#endif
8529+
// At this point, the row variables should contain the following indices
8530+
// of the original input matrix:
8531+
// row0 = 0 4 8 12
8532+
// row1 = 9 13 1 5
8533+
// row2 = 2 6 10 14
8534+
// row3 = 11 15 3 7
8535+
84848536
// -----------------------------------------------
84858537
tmp1 = row2 * row3;
84868538
tmp1 = shuffle<1,0,3,2>(tmp1);
@@ -8535,20 +8587,13 @@ OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
85358587
minor3 = (row1 * tmp1) + minor3;
85368588
// -----------------------------------------------
85378589
det = row0 * minor0;
8538-
det = shuffle<2,3,0,1>(det) + det;
8539-
det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
8540-
tmp1 = vfloat4(_mm_rcp_ss(det));
8541-
det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
8542-
det = shuffle<0>(det);
8590+
float det0 = reduce_add(det);
8591+
float tmp1_0 = 1.0f / det0;
8592+
det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
8593+
det = vfloat4(det0);
85438594
return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
85448595
}
8545-
#elif defined(INCLUDED_IMATHMATRIX_H)
8546-
OIIO_FORCEINLINE matrix44 matrix44::inverse() const {
8547-
return matrix44 (((Imath::M44f*)this)->inverse());
8548-
}
8549-
#else
8550-
#error "Don't know how to compute matrix44::inverse()"
8551-
#endif
8596+
85528597

85538598

85548599
inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {

src/libutil/simd_test.cpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,22 @@ test_special()
15591559
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)/vfloat4(127.0), vfloat4(1.0f));
15601560
OIIO_CHECK_SIMD_EQUAL (vfloat4(c127)*vfloat4(1.0f/127.0), vfloat4(1.0f));
15611561
}
1562+
1563+
// Test the 2-vfloat4 shuffle
1564+
{
1565+
#define PERMUTE(a,b,c,d) ((d<<6)|(c<<4)|(b<<2)|(a<<0))
1566+
vfloat4 a(10, 11, 12, 13);
1567+
vfloat4 b(20, 21, 22, 23);
1568+
OIIO_CHECK_SIMD_EQUAL(shuffle<PERMUTE(2,0,1,3)>(a,b),
1569+
vfloat4(12, 10, 21, 23));
1570+
}
1571+
// Test vfloat4::load_pairs
1572+
{
1573+
vfloat4 x;
1574+
static const float vals[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
1575+
x.load_pairs(vals+2, vals+5);
1576+
OIIO_CHECK_SIMD_EQUAL(x, vfloat4(2, 3, 5, 6));
1577+
}
15621578
}
15631579

15641580

@@ -1825,7 +1841,7 @@ test_matrix()
18251841
{
18261842
Imath::V3f P(1.0f, 0.0f, 0.0f);
18271843
Imath::M44f Mtrans(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 10, 11, 12, 1);
1828-
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI_2, 0.0f));
1844+
Imath::M44f Mrot = Imath::M44f().rotate(Imath::V3f(0.0f, M_PI/4.0f, 0.0f));
18291845

18301846
test_heading("Testing matrix ops:");
18311847
std::cout << " P = " << P << "\n";
@@ -1878,6 +1894,26 @@ test_matrix()
18781894
OIIO_CHECK_ASSERT(
18791895
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
18801896
1.0e-6f));
1897+
1898+
// Test that matrix44::inverse always matches Imath::M44f::inverse
1899+
Imath::M44f rts = (Mtrans * Mrot) * Imath::M44f(2.0f, 0.0f, 0.0f, 0.0f,
1900+
0.0f, 1.0f, 0.0f, 0.0f,
1901+
0.0f, 0.0f, 1.0f, 0.0f,
1902+
0.0f, 0.0f, 0.0f, 1.0f);
1903+
OIIO_CHECK_ASSERT(
1904+
mx_equal_thresh(matrix44(rts.inverse()), matrix44(rts).inverse(),
1905+
1.0e-5f));
1906+
OIIO_CHECK_ASSERT(
1907+
mx_equal_thresh(matrix44(Mtrans.inverse()), matrix44(Mtrans).inverse(),
1908+
1.0e-6f));
1909+
OIIO_CHECK_ASSERT(
1910+
mx_equal_thresh(matrix44(Mrot.inverse()), matrix44(Mrot).inverse(),
1911+
1.0e-6f));
1912+
Imath::M44f m123(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 1.0f);
1913+
OIIO_CHECK_ASSERT(
1914+
mx_equal_thresh(matrix44(m123.inverse()), matrix44(m123).inverse(),
1915+
1.0e-6f));
1916+
18811917
OIIO_CHECK_EQUAL(
18821918
matrix44(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
18831919
Imath::M44f(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));

0 commit comments

Comments
 (0)