diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index 277ddf9fb2..5799a56127 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -4293,6 +4293,11 @@ OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
     // Trickery: load one float worth of bits = 4 uchars!
     simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
     m_simd = _mm_cvtepu8_epi32 (a);
+#elif OIIO_SIMD_SSE >= 2
+    // Trickery: load one float worth of bits = 4 uchars!
+    simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
+    a = _mm_unpacklo_epi8(a, _mm_setzero_si128());
+    m_simd = _mm_unpacklo_epi16(a, _mm_setzero_si128());
 #else
     SIMD_CONSTRUCT (values[i]);
 #endif
@@ -4784,17 +4789,15 @@ OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
 #if OIIO_AVX512VL_ENABLED
     _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
 #elif OIIO_SIMD_SSE
-    // Expressed as bytes and considering little endianness, we
-    // currently have AxBxCxDx (the 'x' means don't care).
-    vint4 clamped = m_simd & vint4(0xff);          // A000 B000 C000 D000
-    vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000
-    vint4 shifted = swapped << 8;                  // 0B00 0A00 0D00 0C00
-    vint4 merged = clamped | shifted;              // AB00 xxxx CD00 xxxx
-    vint4 merged2 = shuffle_sse<2,2,2,2>(merged);  // CD00 ...
-    vint4 shifted2 = merged2 << 16;                // 00CD ...
-    vint4 result = merged | shifted2;              // ABCD ...
-    memcpy(values, &result, 4); // memcpy because it may be unaligned
-    // At this point, values[] should hold A,B,C,D
+    vint4 clamped = m_simd & vint4(0xff);                          // A000 B000 C000 D000
+    simd_t val16 = _mm_packs_epi32(clamped, _mm_setzero_si128());  // A0B0 C0D0 xxxx xxxx
+    simd_t val8 = _mm_packus_epi16(val16, _mm_setzero_si128());    // ABCD xxxx xxxx xxxx
+    _mm_store_ss((float*)values, _mm_castsi128_ps(val8));
+#elif OIIO_SIMD_NEON
+    vint4 clamped = m_simd & vint4(0xff);                                // A000 B000 C000 D000
+    int16x8_t val16 = vcombine_s16(vqmovn_s32(clamped), vdup_n_s16(0));  // A0B0 C0D0 0000 0000
+    uint8x16_t val8 = vcombine_u8(vqmovun_s16(val16), vdup_n_u8(0));     // ABCD 0000 0000 0000
+    vst1q_lane_u32((uint32_t*)values, vreinterpretq_u32_u8(val8), 0);    // store only the low 4 bytes
 #else
     SIMD_DO (values[i] = m_val[i]);
 #endif
diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp
index 5e056d9d11..e94f3a1305 100644
--- a/src/libutil/simd_test.cpp
+++ b/src/libutil/simd_test.cpp
@@ -506,6 +506,14 @@ void test_conversion_loadstore_int ()
     OIIO_CHECK_SIMD_EQUAL (VEC(uc1234), C1234);
     OIIO_CHECK_SIMD_EQUAL (VEC( c1234), C1234);
 
+    // Check store to integers
+    VEC CStep = VEC::Iota(-130, 131);
+    unsigned char ucStepExp[] = {126, 1, 132, 7, 138, 13, 144, 19, 150, 25, 156, 31, 162, 37, 168, 43};
+    unsigned char ucStepGot[VEC::elements] = {};
+    CStep.store(ucStepGot);
+    for (int i = 0; i < VEC::elements; ++i)
+        OIIO_CHECK_EQUAL ((int)ucStepGot[i], (int)ucStepExp[i]);
+
     benchmark ("load from int[]", [](const int *d){ return VEC(d); }, i1234);
     benchmark ("load from unsigned short[]", [](const unsigned short *d){ return VEC(d); }, us1234);
     benchmark ("load from short[]", [](const short *d){ return VEC(d); }, s1234);