Skip to content

Commit 09de3c4

Browse files
committed
SIMD: faster vint4 load/store with unsigned char conversion
vint4::load from an unsigned char pointer gained a pre-SSE4 (SSE2) code path. Tested on a Ryzen 5950X / VS2022 (with only SSE2 enabled in the build):
- vint4 load from unsigned char[]: 946.1 -> 4232.8 Mvals/sec

vint4::store to an unsigned char pointer gained a simpler/faster SSE code path and a NEON code path. It also gained correctness test coverage, including what happens to values outside the unsigned char range (the current behavior just keeps the lowest byte of each integer lane, i.e. it does not clamp).
- vint4 store to unsigned char[]: 3489.8 -> 3979.3 Mvals/sec
- vint8 store to unsigned char[]: 5516.9 -> 7325.3 Mvals/sec

NEON code path, as tested on a Mac M1 Max (clang 15):
- vint4 store to unsigned char[]: 4137.2 -> 6074.8 Mvals/sec

Signed-off-by: Aras Pranckevicius <[email protected]>
1 parent c098d7f commit 09de3c4

File tree

2 files changed

+22
-11
lines changed

2 files changed

+22
-11
lines changed

src/include/OpenImageIO/simd.h

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4293,6 +4293,11 @@ OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
42934293
// Trickery: load one float worth of bits = 4 uchars!
42944294
simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
42954295
m_simd = _mm_cvtepu8_epi32 (a);
4296+
#elif OIIO_SIMD_SSE >= 2
4297+
// Trickery: load one float worth of bits = 4 uchars!
4298+
simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4299+
a = _mm_unpacklo_epi8(a, _mm_setzero_si128());
4300+
m_simd = _mm_unpacklo_epi16(a, _mm_setzero_si128());
42964301
#else
42974302
SIMD_CONSTRUCT (values[i]);
42984303
#endif
@@ -4784,17 +4789,15 @@ OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
47844789
#if OIIO_AVX512VL_ENABLED
47854790
_mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
47864791
#elif OIIO_SIMD_SSE
4787-
// Expressed as bytes and considering little endianness, we
4788-
// currently have AxBxCxDx (the 'x' means don't care).
4789-
vint4 clamped = m_simd & vint4(0xff); // A000 B000 C000 D000
4790-
vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000
4791-
vint4 shifted = swapped << 8; // 0B00 0A00 0D00 0C00
4792-
vint4 merged = clamped | shifted; // AB00 xxxx CD00 xxxx
4793-
vint4 merged2 = shuffle_sse<2,2,2,2>(merged); // CD00 ...
4794-
vint4 shifted2 = merged2 << 16; // 00CD ...
4795-
vint4 result = merged | shifted2; // ABCD ...
4796-
memcpy(values, &result, 4); // memcpy because it may be unaligned
4797-
// At this point, values[] should hold A,B,C,D
4792+
vint4 clamped = m_simd & vint4(0xff); // A000 B000 C000 D000
4793+
simd_t val16 = _mm_packs_epi32(clamped, _mm_setzero_si128()); // A0B0 C0D0 xxxx xxxx
4794+
simd_t val8 = _mm_packus_epi16(val16, _mm_setzero_si128()); // ABCD xxxx xxxx xxxx
4795+
_mm_store_ss((float*)values, _mm_castsi128_ps(val8));
4796+
#elif OIIO_SIMD_NEON
4797+
vint4 clamped = m_simd & vint4(0xff);
4798+
simd_t val16 = vcombine_s16(vqmovn_s32(clamped), vdup_n_s16(0));
4799+
simd_t val8 = vcombine_u8(vqmovun_s16(val16), vdup_n_u8(0));
4800+
vst1q_lane_u32((uint32_t*)values, val8, 0);
47984801
#else
47994802
SIMD_DO (values[i] = m_val[i]);
48004803
#endif

src/libutil/simd_test.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,14 @@ void test_conversion_loadstore_int ()
506506
OIIO_CHECK_SIMD_EQUAL (VEC(uc1234), C1234);
507507
OIIO_CHECK_SIMD_EQUAL (VEC( c1234), C1234);
508508

509+
// Check store to integers
510+
VEC CStep = VEC::Iota(-130, 131);
511+
unsigned char ucStepExp[] = {126, 1, 132, 7, 138, 13, 144, 19, 150, 25, 156, 31, 162, 37, 168, 43};
512+
unsigned char ucStepGot[VEC::elements] = {};
513+
CStep.store(ucStepGot);
514+
for (int i = 0; i < VEC::elements; ++i)
515+
OIIO_CHECK_EQUAL ((int)ucStepGot[i], (int)ucStepExp[i]);
516+
509517
benchmark ("load from int[]", [](const int *d){ return VEC(d); }, i1234);
510518
benchmark ("load from unsigned short[]", [](const unsigned short *d){ return VEC(d); }, us1234);
511519
benchmark ("load from short[]", [](const short *d){ return VEC(d); }, s1234);

0 commit comments

Comments
 (0)