diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index 2332f2bea4eddc..8991fbe7b57877 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2025-03-17 16:12:36 -0400. Do not edit! */ +/* auto-generated on 2025-04-03 18:47:20 -0400. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" @@ -2378,6 +2378,61 @@ const uint8_t pack_1_2_3_utf8_bytes[256][17] = { #endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H /* end file src/tables/utf16_to_utf8_tables.h */ +/* begin file src/tables/utf32_to_utf16_tables.h */ +// file generated by scripts/sse_convert_utf32_to_utf16.py +#ifndef SIMDUTF_UTF32_TO_UTF16_TABLES_H +#define SIMDUTF_UTF32_TO_UTF16_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf32_to_utf16 { + +const uint8_t pack_utf32_to_utf16le[16][16] = { + {0, 1, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +}; + +const uint8_t 
pack_utf32_to_utf16be[16][16] = { + {1, 0, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, +}; + +} // namespace utf32_to_utf16 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF32_TO_UTF16_TABLES_H +/* end file src/tables/utf32_to_utf16_tables.h */ // End of tables. 
// Implementations: they need to be setup before including @@ -2685,6 +2740,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 /* end file src/simdutf/arm64/begin.h */ // Declarations @@ -2808,7 +2864,15 @@ template struct simd8; template > struct base_u8 { uint8x16_t value; static const int SIZE = sizeof(value); - + void dump() const { + uint8_t temp[16]; + vst1q_u8(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x,%04x, %04x, %04x, " + "%04x, %04x, %04x, %04x, %04x]\n", + temp[0], temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], + temp[7], temp[8], temp[9], temp[10], temp[11], temp[12], temp[13], + temp[14], temp[15]); + } // Conversion from/to SIMD register simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} simdutf_really_inline operator const uint8x16_t &() const { @@ -3054,6 +3118,7 @@ template <> struct simd8 : base_u8 { template simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + simdutf_really_inline uint16_t sum_bytes() const { return vaddvq_u8(*this); } // Perform a lookup assuming the value is between 0 and 16 (undefined behavior // for out of range values) @@ -3084,6 +3149,7 @@ template <> struct simd8 : base_u8 { // Signed bytes template <> struct simd8 { int8x16_t value; + static const int SIZE = sizeof(value); static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); @@ -3454,8 +3520,10 @@ template struct simd16; template > struct base_u16 { uint16x8_t value; + /// the size of vector in bytes static const int SIZE = sizeof(value); - + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); // Conversion from/to SIMD register simdutf_really_inline base_u16() = default; simdutf_really_inline base_u16(const uint16x8_t _value) : 
value(_value) {} @@ -3515,7 +3583,12 @@ struct base16 : base_u16 { simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {} static const int SIZE = sizeof(base_u16::value); - + void dump() const { + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); + } template simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { return vextq_u18(prev_chunk, *this, 8 - N); @@ -3558,10 +3631,10 @@ template struct base16_numeric : base16 { // Addition/subtraction are the same for signed and unsigned simdutf_really_inline simd16 operator+(const simd16 other) const { - return vaddq_u8(*this, other); + return vaddq_u16(*this, other); } simdutf_really_inline simd16 operator-(const simd16 other) const { - return vsubq_u8(*this, other); + return vsubq_u16(*this, other); } simdutf_really_inline simd16 &operator+=(const simd16 other) { *this = *this + other; @@ -3723,6 +3796,14 @@ template <> struct simd16 : base16_numeric { simdutf_really_inline simd16 swap_bytes() const { return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this))); } + void dump() const { + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); + } + + simdutf_really_inline uint32_t sum() const { return vaddlvq_u16(value); } }; simdutf_really_inline simd16::operator simd16() const { return this->value; @@ -3857,7 +3938,180 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( (this->chunks[3] < mask_low))); return x.to_bitmask(); } + +simdutf_really_inline simd16 min(const simd16 a, + simd16 b) { + return vminq_u16(a.value, b.value); +} /* end file src/simdutf/arm64/simd16-inl.h */ +/* begin file src/simdutf/arm64/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + static const size_t SIZE = 
sizeof(uint32x4_t); + static const size_t ELEMENTS = SIZE / sizeof(uint32_t); + + uint32x4_t value; + + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} + + template + simdutf_really_inline simd32(const Pointer *ptr) + : value(vld1q_u32(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { return vaddvq_u32(value); } + + simdutf_really_inline simd32 swap_bytes() const { + return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(value))); + } + + template simdutf_really_inline simd32 shr() const { + return vshrq_n_u32(value, N); + } + + template simdutf_really_inline simd32 shl() const { + return vshlq_n_u32(value, N); + } + + void dump() const { + uint32_t temp[4]; + vst1q_u32(temp, value); + printf("[%08x, %08x, %08x, %08x]\n", temp[0], temp[1], temp[2], temp[3]); + } + + // operators + simdutf_really_inline simd32 &operator+=(const simd32 other) { + value = vaddq_u32(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd32 zero() { + return vdupq_n_u32(0); + } + + simdutf_really_inline static simd32 splat(uint32_t v) { + return vdupq_n_u32(v); + } +}; + +//---------------------------------------------------------------------- + +template <> struct simd32 { + uint32x4_t value; + + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} + + simdutf_really_inline bool any() const { return vmaxvq_u32(value) != 0; } +}; + +//---------------------------------------------------------------------- + +template +simdutf_really_inline simd32 operator|(const simd32 a, + const simd32 b) { + return vorrq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 min(const simd32 a, + const simd32 b) { + return vminq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 max(const simd32 a, + const simd32 b) { + return vmaxq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return vceqq_u32(a.value, vdupq_n_u32(b)); +} + 
+simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return vandq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return vandq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return vorrq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator+(const simd32 a, + const simd32 b) { + return vaddq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return vsubq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator>=(const simd32 a, + const simd32 b) { + return vcgeq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator!(const simd32 v) { + return vmvnq_u32(v.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return vcgtq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return vbslq_u32(cond.value, v_true.value, v_false.value); +} +/* end file src/simdutf/arm64/simd32-inl.h */ +/* begin file src/simdutf/arm64/simd64-inl.h */ +template struct simd64; + +template <> struct simd64 { + uint64x2_t value; + + simdutf_really_inline simd64(const uint64x2_t v) : value(v) {} + + template + simdutf_really_inline simd64(const Pointer *ptr) + : value(vld1q_u64(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { return vaddvq_u64(value); } + + // operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = vaddq_u64(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd64 zero() { + return vdupq_n_u64(0); + } + + simdutf_really_inline static simd64 splat(uint64_t v) { + return vdupq_n_u64(v); + } +}; +/* end file src/simdutf/arm64/simd64-inl.h */ + +simdutf_really_inline simd64 sum_8bytes(const simd8 v) { + // We do it as 3 
instructions. There might be a faster way. + // We hope that these 3 instructions are cheap. + uint16x8_t first_sum = vpaddlq_u8(v); + uint32x4_t second_sum = vpaddlq_u16(first_sum); + return vpaddlq_u32(second_sum); +} + } // namespace simd } // unnamed namespace } // namespace arm64 @@ -4442,6 +4696,98 @@ namespace icelake { namespace { namespace simd { +/* begin file src/simdutf/icelake/simd16-inl.h */ +template struct simd16; + +template <> struct simd16 { + static const size_t SIZE = sizeof(__m512i); + static const size_t ELEMENTS = SIZE / sizeof(uint16_t); + + template + static simdutf_really_inline simd16 load(const Pointer *ptr) { + return simd16(ptr); + } + + __m512i value; + + simdutf_really_inline simd16(const __m512i v) : value(v) {} + + template + simdutf_really_inline simd16(const Pointer *ptr) + : value(_mm512_loadu_si512(reinterpret_cast(ptr))) {} + + // operators + simdutf_really_inline simd16 &operator+=(const simd16 other) { + value = _mm512_add_epi32(value, other.value); + return *this; + } + + simdutf_really_inline simd16 &operator-=(const simd16 other) { + value = _mm512_sub_epi32(value, other.value); + return *this; + } + + // methods + simdutf_really_inline simd16 swap_bytes() const { + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + + return _mm512_shuffle_epi8(value, byteflip); + } + + simdutf_really_inline uint64_t sum() const { + const auto lo = _mm512_and_si512(value, _mm512_set1_epi32(0xffff)); + const auto hi = _mm512_srli_epi32(value, 16); + const auto sum32 = _mm512_add_epi32(lo, hi); + + return _mm512_reduce_add_epi32(sum32); + } + + // static members + simdutf_really_inline static simd16 zero() { + return _mm512_setzero_si512(); + } + + simdutf_really_inline static simd16 splat(uint16_t v) { + return _mm512_set1_epi16(v); + } +}; + +template <> struct simd16 { + 
__mmask32 value; + + simdutf_really_inline simd16(const __mmask32 v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd16 min(const simd16 b, + const simd16 a) { + return _mm512_min_epu16(a.value, b.value); +} + +simdutf_really_inline simd16 operator&(const simd16 a, + uint16_t b) { + return _mm512_and_si512(a.value, _mm512_set1_epi16(b)); +} + +simdutf_really_inline simd16 operator^(const simd16 a, + uint16_t b) { + return _mm512_xor_si512(a.value, _mm512_set1_epi16(b)); +} + +simdutf_really_inline simd16 operator^(const simd16 a, + const simd16 b) { + return _mm512_xor_si512(a.value, b.value); +} + +simdutf_really_inline simd16 operator==(const simd16 a, + uint16_t b) { + return _mm512_cmpeq_epi16_mask(a.value, _mm512_set1_epi16(b)); +} +/* end file src/simdutf/icelake/simd16-inl.h */ /* begin file src/simdutf/icelake/simd32-inl.h */ template struct simd32; @@ -4939,7 +5285,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -6436,7 +6782,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. 
@@ -6834,7 +7180,6 @@ template <> struct simd8 : base8_numeric { simdutf_really_inline uint64_t sum_bytes() const { const auto tmp = _mm_sad_epu8(value, _mm_setzero_si128()); - return _mm_extract_epi64(tmp, 0) + _mm_extract_epi64(tmp, 1); } }; @@ -7398,6 +7743,14 @@ template <> struct simd32 { return _mm_shuffle_epi8(value, shuffle); } + template simdutf_really_inline simd32 shr() const { + return _mm_srli_epi32(value, N); + } + + template simdutf_really_inline simd32 shl() const { + return _mm_slli_epi32(value, N); + } + void dump() const { printf("[%08x, %08x, %08x, %08x]\n", uint32_t(_mm_extract_epi32(value, 0)), uint32_t(_mm_extract_epi32(value, 1)), @@ -7434,6 +7787,10 @@ template <> struct simd32 { simdutf_really_inline bool any() const { return _mm_movemask_epi8(value) != 0; } + + simdutf_really_inline uint8_t to_4bit_bitmask() const { + return uint8_t(_mm_movemask_ps(_mm_castsi128_ps(value))); + } }; //---------------------------------------------------------------------- @@ -7454,16 +7811,36 @@ simdutf_really_inline simd32 max(const simd32 a, return _mm_max_epu32(a.value, b.value); } +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return _mm_cmpeq_epi32(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator&(const simd32 a, const simd32 b) { return _mm_and_si128(a.value, b.value); } +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return _mm_and_si128(a.value, _mm_set1_epi32(b)); +} + +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return _mm_or_si128(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator+(const simd32 a, const simd32 b) { return _mm_add_epi32(a.value, b.value); } +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return _mm_sub_epi32(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator>=(const simd32 a, const simd32 b) { return _mm_cmpeq_epi32(_mm_max_epu32(a.value, b.value), a.value); 
@@ -7477,6 +7854,12 @@ simdutf_really_inline simd32 operator>(const simd32 a, const simd32 b) { return !(b >= a); } + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return _mm_blendv_epi8(v_false.value, v_true.value, cond.value); +} /* end file src/simdutf/westmere/simd32-inl.h */ /* begin file src/simdutf/westmere/simd64-inl.h */ template struct simd64; @@ -7518,6 +7901,10 @@ simdutf_really_inline simd64 sum_8bytes(const simd8 v) { return _mm_sad_epu8(v.value, simd8::zero()); } +simdutf_really_inline simd8 as_vector_u8(const simd32 v) { + return simd8(v.value); +} + } // namespace simd } // unnamed namespace } // namespace westmere @@ -10053,6 +10440,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/lsx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lsx" // #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lsx/begin.h */ // Declarations @@ -10065,6 +10453,190 @@ class implementation final : public simdutf::implementation { // you use visual studio or other compilers. #include +/* +Encoding of argument for LoongArch64 xvldi instruction. 
See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm + +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes + +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes + +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes + +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes + +5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes + +6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes + +7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all +lanes + +8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to +all lanes + +9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes + +10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast +the result as 64-bit elements to all lanes +*/ + +namespace vldi { + +template class const_u16 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + + constexpr static bool is_case5 = uint16_t(b0) == v; + constexpr static bool is_case6 = (uint16_t(b1) << 8) == v; + constexpr static bool is_case9 = (b0 == b1); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)); + +public: + constexpr static uint16_t operation = is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case5 ? b0 + : is_case6 ? b1 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 
0xaa : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u32 { + constexpr static const uint8_t b0 = (v & 0xff); + constexpr static const uint8_t b1 = ((v >> 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 16) & 0xff); + constexpr static const uint8_t b3 = ((v >> 24) & 0xff); + + constexpr static bool is_case1 = (uint32_t(b0) == v); + constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v); + constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v); + constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v); + constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0); + constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0); + constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff); + constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff); + constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)); + +public: + constexpr static uint16_t operation = is_case1 ? 0b10000 + : is_case2 ? 0b10001 + : is_case3 ? 0b10010 + : is_case4 ? 0b10011 + : is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case7 ? 0b10110 + : is_case8 ? 0b10111 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case1 ? b0 + : is_case2 ? b1 + : is_case3 ? b2 + : is_case4 ? b3 + : is_case5 ? b0 + : is_case6 ? b1 + : is_case7 ? b1 + : is_case8 ? b2 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) | + (b2 ? 0x44 : 0x00) | (b3 ? 
0x88 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u64 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff); + constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff); + constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff); + constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff); + constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff); + constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff); + + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) && + ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) && + ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00)); + +public: + constexpr static bool is_32bit = + ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid; + constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation; + constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte; + + constexpr static uint16_t operation = is_32bit ? op_32bit + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_32bit ? byte_32bit + : is_case10 + ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) | + (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) | + (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; +} // namespace vldi + +// Uncomment when running under QEMU affected +// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865 +// Versions <= 9.2.2 are affected, likely anything newer is correct. 
+#ifndef QEMU_VLDI_BUG +// #define QEMU_VLDI_BUG 1 +#endif + +#ifdef QEMU_VLDI_BUG + #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v) + #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v) +#else +template constexpr __m128i lsx_splat_u16_aux() { + constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512); + constexpr uint16_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = vldi::const_u16::valid; + constexpr int vldi_imm = is_vldi ? vldi::const_u16::value : 0; + + return is_imm10 ? __lsx_vrepli_h(int16_t(imm10)) + : is_vldi ? __lsx_vldi(vldi_imm) + : __lsx_vreplgr2vr_h(x); +} + +template constexpr __m128i lsx_splat_u32_aux() { + constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512); + constexpr uint32_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = vldi::const_u32::valid; + constexpr int vldi_imm = is_vldi ? vldi::const_u32::value : 0; + + return is_imm10 ? __lsx_vrepli_w(int32_t(imm10)) + : is_vldi ? __lsx_vldi(vldi_imm) + : __lsx_vreplgr2vr_w(x); +} + + #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>() + #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>() +#endif // QEMU_VLDI_BUG + #endif // SIMDUTF_LSX_INTRINSICS_H /* end file src/simdutf/lsx/intrinsics.h */ /* begin file src/simdutf/lsx/bitmanipulation.h */ @@ -10685,6 +11257,7 @@ template struct simd8x64 { .to_bitmask(); } }; // struct simd8x64 + /* begin file src/simdutf/lsx/simd16-inl.h */ template struct simd16; @@ -11065,6 +11638,78 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( return x.to_bitmask(); } /* end file src/simdutf/lsx/simd16-inl.h */ +/* begin file src/simdutf/lsx/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + __m128i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); + + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} + + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {} + + // in-place operators + simdutf_really_inline 
simd32 &operator-=(const simd32 other) { + value = __lsx_vsub_w(value, other.value); + return *this; + } + + // members + simdutf_really_inline uint64_t sum() const { + return uint64_t(__lsx_vpickve2gr_wu(value, 0)) + + uint64_t(__lsx_vpickve2gr_wu(value, 1)) + + uint64_t(__lsx_vpickve2gr_wu(value, 2)) + + uint64_t(__lsx_vpickve2gr_wu(value, 3)); + } + + // static members + static simdutf_really_inline simd32 splat(uint32_t x) { + return __lsx_vreplgr2vr_w(x); + } + + static simdutf_really_inline simd32 zero() { + return __lsx_vrepli_w(0); + } +}; + +// ------------------------------------------------------------ + +template <> struct simd32 { + __m128i value; + static const int SIZE = sizeof(value); + + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return __lsx_vand_v(a.value, b.value); +} + +simdutf_really_inline simd32 operator<(const simd32 a, + const simd32 b) { + return __lsx_vslt_wu(a.value, b.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return __lsx_vslt_wu(b.value, a.value); +} + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 as_vector_u32(const simd32 v) { + return v.value; +} +/* end file src/simdutf/lsx/simd32-inl.h */ + } // namespace simd } // unnamed namespace } // namespace lsx @@ -11074,6 +11719,7 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( /* end file src/simdutf/lsx/simd16-inl.h */ /* end file src/simdutf/lsx/simd.h */ /* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lsx/end.h */ #endif // SIMDUTF_IMPLEMENTATION_LSX @@ -11382,6 +12028,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/lasx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lasx" // #define SIMDUTF_IMPLEMENTATION lasx +#define 
SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lasx/begin.h */ // Declarations @@ -11485,6 +12132,191 @@ static inline __m128i lasx_extracti128_hi(__m256i in) { } #endif +/* +Encoding of argument for LoongArch64 xvldi instruction. See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm + +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes + +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes + +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes + +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes + +5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes + +6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes + +7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all +lanes + +8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to +all lanes + +9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes + +10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast +the result as 64-bit elements to all lanes +*/ + +namespace lasx_vldi { + +template class const_u16 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + + constexpr static bool is_case5 = uint16_t(b0) == v; + constexpr static bool is_case6 = (uint16_t(b1) << 8) == v; + constexpr static bool is_case9 = (b0 == b1); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)); + +public: + constexpr static uint16_t operation = is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case5 ? b0 + : is_case6 ? b1 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 
0xaa : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u32 { + constexpr static const uint8_t b0 = (v & 0xff); + constexpr static const uint8_t b1 = ((v >> 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 16) & 0xff); + constexpr static const uint8_t b3 = ((v >> 24) & 0xff); + + constexpr static bool is_case1 = (uint32_t(b0) == v); + constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v); + constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v); + constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v); + constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0); + constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0); + constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff); + constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff); + constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)); + +public: + constexpr static uint16_t operation = is_case1 ? 0b10000 + : is_case2 ? 0b10001 + : is_case3 ? 0b10010 + : is_case4 ? 0b10011 + : is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case7 ? 0b10110 + : is_case8 ? 0b10111 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case1 ? b0 + : is_case2 ? b1 + : is_case3 ? b2 + : is_case4 ? b3 + : is_case5 ? b0 + : is_case6 ? b1 + : is_case7 ? b1 + : is_case8 ? b2 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) | + (b2 ? 0x44 : 0x00) | (b3 ? 
0x88 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u64 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff); + constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff); + constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff); + constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff); + constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff); + constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff); + + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) && + ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) && + ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00)); + +public: + constexpr static bool is_32bit = + ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid; + constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation; + constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte; + + constexpr static uint16_t operation = is_32bit ? op_32bit + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_32bit ? byte_32bit + : is_case10 + ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) | + (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) | + (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +} // namespace lasx_vldi + +// Uncomment when running under QEMU affected +// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865 +// Versions <= 9.2.2 are affected, likely anything newer is correct. 
+#ifndef QEMU_VLDI_BUG +// #define QEMU_VLDI_BUG 1 +#endif + +#ifdef QEMU_VLDI_BUG + #define lasx_splat_u16(v) __lasx_xvreplgr2vr_h(v) + #define lasx_splat_u32(v) __lasx_xvreplgr2vr_w(v) +#else +template constexpr __m256i lasx_splat_u16_aux() { + constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512); + constexpr uint16_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = lasx_vldi::const_u16::valid; + constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u16::value : 0; + + return is_imm10 ? __lasx_xvrepli_h(int16_t(imm10)) + : is_vldi ? __lasx_xvldi(vldi_imm) + : __lasx_xvreplgr2vr_h(x); +} + +template constexpr __m256i lasx_splat_u32_aux() { + constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512); + constexpr uint32_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = lasx_vldi::const_u32::valid; + constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u32::value : 0; + + return is_imm10 ? __lasx_xvrepli_w(int32_t(imm10)) + : is_vldi ? __lasx_xvldi(vldi_imm) + : __lasx_xvreplgr2vr_w(x); +} + + #define lasx_splat_u16(v) lasx_splat_u16_aux<(v)>() + #define lasx_splat_u32(v) lasx_splat_u32_aux<(v)>() +#endif // QEMU_VLDI_BUG + #endif // SIMDUTF_LASX_INTRINSICS_H /* end file src/simdutf/lasx/intrinsics.h */ /* begin file src/simdutf/lasx/bitmanipulation.h */ @@ -12569,6 +13401,83 @@ template struct simd16x32 { } }; // struct simd16x32 /* end file src/simdutf/lasx/simd16-inl.h */ +/* begin file src/simdutf/lasx/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); + + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} + + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {} + + // in-place operators + simdutf_really_inline simd32 &operator-=(const simd32 other) { + value = __lasx_xvsub_w(value, other.value); + return *this; + } + + // members + 
simdutf_really_inline uint64_t sum() const { + const auto odd = __lasx_xvsrli_d(value, 32); + const auto even = __lasx_xvand_v(value, __lasx_xvreplgr2vr_d(0xffffffff)); + + const auto sum64 = __lasx_xvadd_d(odd, even); + + return uint64_t(__lasx_xvpickve2gr_du(sum64, 0)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 1)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 2)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 3)); + } + + // static members + static simdutf_really_inline simd32 splat(uint32_t x) { + return __lasx_xvreplgr2vr_w(x); + } + + static simdutf_really_inline simd32 zero() { + return __lasx_xvrepli_w(0); + } +}; + +// ------------------------------------------------------------ + +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); + + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return __lasx_xvor_v(a.value, b.value); +} + +simdutf_really_inline simd32 operator<(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(a.value, b.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(b.value, a.value); +} + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 as_vector_u32(const simd32 v) { + return v.value; +} +/* end file src/simdutf/lasx/simd32-inl.h */ + } // namespace simd } // unnamed namespace } // namespace lasx @@ -12578,6 +13487,7 @@ template struct simd16x32 { /* end file src/simdutf/lasx/simd.h */ /* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lasx/end.h */ #endif // SIMDUTF_IMPLEMENTATION_LASX @@ -18563,6 +19473,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 
+#define SIMDUTF_SIMD_HAS_BYTEMASK 1 /* end file src/simdutf/arm64/begin.h */ namespace simdutf { namespace arm64 { @@ -20439,6 +21350,64 @@ arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); } + +template +simdutf_really_inline size_t +arm64_utf8_length_from_utf16_bytemask(const char16_t *in, size_t size) { + size_t pos = 0; + + constexpr size_t N = 8; + const auto one = vmovq_n_u16(1); + // each char16 yields at least one byte + size_t count = size / N * N; + + for (; pos < size / N * N; pos += N) { + auto input = vld1q_u16(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input))); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + const auto is_surrogate = + vceqq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), vmovq_n_u16(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xff80)), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), one); + + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. 
+ + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. + */ + auto v_count = vaddq_u16(c1, c0); + v_count = vaddq_u16(v_count, is_surrogate); + count += vaddlvq_u16(v_count); + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 @@ -20519,6 +21488,26 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, vst4q_u8(out, result); out += 64; } + + if (i + 24 <= srclen) { + const uint8x8_t v3f_d = vdup_n_u8(0x3f); + const uint8x8x3_t in = vld3_u8((const uint8_t *)src + i); + uint8x8x4_t result; + result.val[0] = vshr_n_u8(in.val[0], 2); + result.val[1] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[1], 4), in.val[0], 4), v3f_d); + result.val[2] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[2], 6), in.val[1], 2), v3f_d); + result.val[3] = vand_u8(in.val[2], v3f_d); + result.val[0] = vqtbl4_u8(table, result.val[0]); + result.val[1] = vqtbl4_u8(table, result.val[1]); + result.val[2] = vqtbl4_u8(table, result.val[2]); + result.val[3] = vqtbl4_u8(table, result.val[3]); + vst4_u8(out, result); + out += 32; + i += 24; + } + out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options); @@ -20763,11 +21752,9 @@ void load_block(block64 *b, const char16_t *src) { void base64_decode_block(char *out, const char *src) { uint8x16x4_t str = vld4q_u8((uint8_t *)src); uint8x16x3_t outvec; - outvec.val[0] = - vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); - outvec.val[1] = - vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); - outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + outvec.val[0] = vsliq_n_u8(vshrq_n_u8(str.val[1], 4), str.val[0], 2); + outvec.val[1] = 
vsliq_n_u8(vshrq_n_u8(str.val[2], 2), str.val[1], 4); + outvec.val[2] = vsliq_n_u8(str.val[3], str.val[2], 6); vst3q_u8((uint8_t *)out, outvec); } @@ -23109,7 +24096,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -23152,7 +24138,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -24597,12 +25582,13 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); + return arm64_utf8_length_from_utf16_bytemask(input, + length); } simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); + return arm64_utf8_length_from_utf16_bytemask(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 @@ -27604,6 +28590,92 @@ simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, } } /* end file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */ +// This is translation of `utf8_length_from_utf16_bytemask` from +// `generic/utf16.h` +template +simdutf_really_inline size_t icelake_utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + + const auto one = vector_u16::splat(1); + + auto v_count = vector_u16::zero(); + + // each 
char16 yields at least one byte + size_t count = size / N * N; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input = input.swap_bytes(); + } + + // not_surrogate[i] = non-zero if i-th element is not a surrogate word + const auto not_surrogate = (input & uint16_t(0xf800)) ^ uint16_t(0xd800); + + // not_surrogate[i] = 1 if surrogate word, 0 otherwise + const auto is_surrogate = min(not_surrogate, one) ^ one; + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. + + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. 
+ */ + v_count += c0; + v_count += c1; + v_count -= is_surrogate; + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + + iteration = max_iterations; + } + } + + if (iteration > 0) { + count += v_count.sum(); + } + + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} +/* end file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */ #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 @@ -29449,7 +30521,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -29460,16 +30533,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. 
vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -29485,6 +30564,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ -29499,6 +30594,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -29521,9 +30617,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -30840,83 +31942,12 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; - } - } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + return icelake_utf8_length_from_utf16(input, length); } simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = 
_mm512_set1_epi16((uint16_t)0xd800); - - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; - } - } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + return icelake_utf8_length_from_utf16(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 @@ -31195,7 +32226,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -35354,7 +36385,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -35365,16 +36397,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -35390,6 +36428,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-35404,6 +36458,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -35426,9 +36481,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -35627,7 +36688,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -35670,7 +36730,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -42236,7 +43295,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -42279,7 +43337,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -42465,7 +43522,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -42476,16 +43534,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -42501,6 +43565,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-42515,6 +43595,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -42537,9 +43618,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -44458,7 +45545,7 @@ simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, /* validate first three bytes */ { size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) ++idx; if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) return 0; @@ -44525,7 +45612,7 @@ simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, } /* we need to validate the last character */ - while (tail < len && (src[0] >> 6) == 0b10) + while (tail < len && (uint8_t(src[0]) >> 6) == 0b10) --src, ++tail; return src - beg; } @@ -45459,7 +46546,7 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, /* validate first three bytes */ if (validate) { size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) ++idx; if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) return 0; @@ -45694,9 +46781,9 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, /* validate the last character and reparse it + tail */ if (len > tail) { - if ((src[0] >> 6) == 0b10) + if ((uint8_t(src[0]) >> 6) == 0b10) --dst; - while ((src[0] >> 6) == 0b10 && tail < len) + while 
((uint8_t(src[0]) >> 6) == 0b10 && tail < len) --src, ++tail; if (is16) { /* go back one more, when on high surrogate */ @@ -46156,7 +47243,7 @@ SIMDUTF_UNTARGET_REGION /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. @@ -48323,6 +49410,60 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +struct expansion_result_t { + size_t u16count; + __m128i compressed; +}; + +// Function sse_expand_surrogate takes four **valid** UTF-32 characters +// having at least one code-point producing a surrogate pair. +template +expansion_result_t sse_expand_surrogate(const __m128i x) { + using vector_u32 = simd32; + using vector_u8 = simd8; + + const auto in = vector_u32(x); + + const auto non_surrogate_mask = (in & uint32_t(0xffff0000)) == uint32_t(0); + const auto mask = (~non_surrogate_mask.to_4bit_bitmask()) & 0xf; + + const auto t0 = in - uint32_t(0x00010000); + const auto hi = t0.shr<10>() & uint32_t(0x000003ff); + const auto lo = t0.shl<16>() & uint32_t(0x03ff0000); + const auto surrogates = (lo | hi) | uint32_t(0xdc00d800); + + const auto merged = as_vector_u8(select(non_surrogate_mask, in, surrogates)); + + const auto shuffle = vector_u8::load( + (byte_order == endianness::LITTLE) + ? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask] + : tables::utf32_to_utf16::pack_utf32_to_utf16be[mask]); + + const size_t u16count = (4 + count_ones(mask)); + const auto compressed = shuffle.lookup_16(merged); + + return {u16count, compressed}; +} + +// Function `validate_utf32` checks 2 x 4 UTF-32 characters for their validity. 
+simdutf_really_inline bool validate_utf32(const __m128i a, const __m128i b) { + using vector_u32 = simd32; + + const auto in0 = vector_u32(a); + const auto in1 = vector_u32(b); + + const auto standardmax = vector_u32::splat(0x10ffff); + const auto offset = vector_u32::splat(0xffff2000); + const auto standardoffsetmax = vector_u32::splat(0xfffff7ff); + + const auto too_large = max(in0, in1) > standardmax; + const auto surrogate0 = (in0 + offset) > standardoffsetmax; + const auto surrogate1 = (in1 + offset) > standardoffsetmax; + + const auto combined = too_large | surrogate0 | surrogate1; + return !combined.any(); +} + template std::pair sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, @@ -48333,66 +49474,61 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); __m128i forbidden_bytemask = _mm_setzero_si128(); - while (end - buf >= 8) { - const __m128i in = _mm_loadu_si128((__m128i *)buf); - const __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + while (end - buf >= 16 + 8) { + const __m128i *ptr = reinterpret_cast(buf); + const __m128i in0 = _mm_loadu_si128(ptr + 0); + const __m128i in1 = _mm_loadu_si128(ptr + 1); + const __m128i in2 = _mm_loadu_si128(ptr + 2); + const __m128i in3 = _mm_loadu_si128(ptr + 3); - const __m128i combined = _mm_or_si128(in, nextin); + const __m128i combined = + _mm_or_si128(_mm_or_si128(in2, in3), _mm_or_si128(in0, in1)); if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) { // No bits set above 16th, directly pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); + __m128i utf16_packed0 = _mm_packus_epi32(in0, in1); + __m128i utf16_packed1 = _mm_packus_epi32(in2, in3); const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); forbidden_bytemask = _mm_or_si128( forbidden_bytemask, - _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + _mm_or_si128( + 
_mm_cmpeq_epi16(_mm_and_si128(utf16_packed0, v_f800), v_d800), + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed1, v_f800), v_d800))); if (big_endian) { const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + utf16_packed0 = _mm_shuffle_epi8(utf16_packed0, swap); + utf16_packed1 = _mm_shuffle_epi8(utf16_packed1, swap); } - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; + _mm_storeu_si128((__m128i *)utf16_output + 0, utf16_packed0); + _mm_storeu_si128((__m128i *)utf16_output + 1, utf16_packed1); + utf16_output += 16; + buf += 16; } else { - size_t forward = 7; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf16_output); - } - *utf16_output++ = - big_endian - ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf16_output); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } + if (!validate_utf32(in0, in1) || !validate_utf32(in2, in3)) { + return std::make_pair(nullptr, utf16_output); } - buf += k; + + const auto ret0 = sse_expand_surrogate(in0); + _mm_storeu_si128((__m128i *)utf16_output, ret0.compressed); + utf16_output += ret0.u16count; + + const auto ret1 = sse_expand_surrogate(in1); + _mm_storeu_si128((__m128i *)utf16_output, ret1.compressed); + utf16_output += ret1.u16count; + + const auto ret2 = sse_expand_surrogate(in2); + _mm_storeu_si128((__m128i *)utf16_output, ret2.compressed); + utf16_output += ret2.u16count; + + const auto ret3 = sse_expand_surrogate(in3); + _mm_storeu_si128((__m128i *)utf16_output, ret3.compressed); + utf16_output += ret3.u16count; + + buf += 16; } } @@ -50253,7 +51389,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -50264,16 +51401,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -50289,6 +51432,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-50303,6 +51462,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -50325,9 +51485,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -50524,7 +51690,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -50567,7 +51732,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -52729,6 +53893,7 @@ SIMDUTF_UNTARGET_REGION /* begin file src/simdutf/lsx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lsx" // #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lsx/begin.h */ namespace simdutf { namespace lsx { @@ -52764,10 +53929,7 @@ const uint8_t lsx_1_2_utf8_bytes_mask[] = { #if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { - // const v16u8 shuf = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - // return __lsx_vshuf_b(__lsx_vldi(0), vec, shuf); return __lsx_vshuf4i_b(vec, 0b10110001); - // return __lsx_vor_v(__lsx_vslli_h(vec, 8), __lsx_vsrli_h(vec, 8)); } 
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 @@ -52839,7 +54001,7 @@ convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits // 1 byte: 00000000 00000000 // 2 byte: 00000aaa aa000000 - const __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + const __m128i v1f00 = lsx_splat_u16(0x1f00); __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits // Combine with a shift right accumulate // 1 byte: 00000000 0bbbbbbb @@ -52869,15 +54031,14 @@ simd8 utf16_gather_high_bytes(const simd16 in0, #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING /* begin file src/lsx/lsx_validate_utf32le.cpp */ - const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; - __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); - __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); - __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ - __m128i currentmax = __lsx_vldi(0x0); - __m128i currentoffsetmax = __lsx_vldi(0x0); + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); while (input + 4 < end) { __m128i in = __lsx_vld(reinterpret_cast(input), 0); @@ -52909,11 +54070,11 @@ const result lsx_validate_utf32le_with_errors(const char32_t *input, const char32_t *start = input; const char32_t *end = input + size; - __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); - __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); - __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ - __m128i currentmax = __lsx_vldi(0x0); - __m128i currentoffsetmax = __lsx_vldi(0x0); + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i 
standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); while (input + 4 < end) { __m128i in = __lsx_vld(reinterpret_cast(input), 0); @@ -52975,7 +54136,7 @@ lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len, // t0 = [0000|00aa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in16, 2); // t1 = [0000|00aa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x300)); // t3 = [0000|00aa|00bb|bbbb] __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f)); // t4 = [1100|00aa|10bb|bbbb] @@ -53191,7 +54352,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // 1 byte: 00000000 00000000 // 2 byte: xx0bbbbb 00000000 // 3 byte: xxbbbbbb 00000000 - __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); // 1 byte: 00000000 0ccccccc // 2 byte: 0010bbbb bbcccccc // 3 byte: 0010bbbb bbcccccc @@ -53243,10 +54404,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // = +0xDC00|0xE7C0 __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); // Generate unadjusted trail surrogate minus lowest 2 bits - // vec(0000FF00) = __lsx_vldi(-1758) // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 - __m128i trail = - __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000ff00)); // Insert low 2 bits of trail surrogate to magic number for later // 11011100 00000000 11100111 110000cc __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); @@ -53259,10 +54418,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, __lsx_vrepli_h(0x3f /* 0x003f*/)); // Blend pairs - // __lsx_vldi(-1741) => vec(0x0000FFFF) // 000000cc ccdddddd|11110aaa bbbbbb00 - __m128i blend = - __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* 
(0x0000FFFF)*4 */); + __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF)); // Add magic number to finish the result // 110111CC CCDDDDDD|110110AA BBBBBBCC @@ -53301,13 +54458,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, // first. __m128i middlehigh = __lsx_vslli_w(perm, 2); // 00000000 00000000 00cccccc 00000000 - __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00)); // Start assembling the sequence. Since the 4th byte is in the same position // as it would be in a surrogate and there is no dependency, shift left // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - __m128i ab = - __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); // Top 16 bits contains the high ten bits of the surrogate pair before // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction @@ -53321,8 +54477,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // After this is for surrogates // Blend the low and high surrogates // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - __m128i mixed = - __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits // yet as 0x10000 was not subtracted from the codepoint yet. 
4 byte: // 11110aaa bbbbbbcc|000000cc ccdddddd @@ -53476,14 +54631,13 @@ size_t convert_masked_utf8_to_utf32(const char *input, // The top bits will be corrected later in the bsl // 00000000 10bbbbbb 00000000 __m128i middle = - __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits // Combine low and middle with shift right accumulate // 00000000 00xxbbbb bbcccccc __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); // Insert top 4 bits from high byte with bitwise select // 00000000 aaaabbbb bbcccccc - __m128i composed = - __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); __lsx_vst(composed, utf32_output, 0); utf32_output += 4; // We wrote 4 32-bit characters. return consumed; @@ -53512,10 +54666,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, __m128i merge2 = __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ - __lsx_vldi(-2545)); /*0x00000FFF*/ + lsx_splat_u32(0x00000FFF)); // Clear the garbage // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -53535,11 +54689,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, // Ascii __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); - __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); // 00000000 00000000 0000cccc ccdddddd __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); - __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000)); __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); // Insert twice // 00000000 000aaabb bbbbxxxx xxxxxxxx @@ -53549,8 +54703,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); ab = __lsx_vsrli_w(ab, 4); // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = - __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -53820,7 +54973,7 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -53846,9 +54999,8 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { utf8_output += row[0]; continue; } - __m128i surrogates_bytemask = - __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), - __lsx_vldi(-2600 /*0xD800*/)); + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. if (__lsx_bz_v(surrogates_bytemask)) { @@ -53888,14 +55040,14 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); @@ -53903,8 +55055,8 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); - __m128i m0 = 
__lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -54059,7 +55211,7 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54085,9 +55237,8 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, utf8_output += row[0]; continue; } - __m128i surrogates_bytemask = - __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), - __lsx_vldi(-2600 /*0xD800*/)); + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
if (__lsx_bz_v(surrogates_bytemask)) { @@ -54127,14 +55278,14 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); @@ -54142,8 +55293,8 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -54249,8 +55400,8 @@ lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len, const char16_t *end = buf + len; __m128i zero = __lsx_vldi(0); - __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); while (end - buf >= 8) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); @@ -54322,8 +55473,8 @@ lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, const char16_t *end = buf + len; __m128i zero = __lsx_vldi(0); - __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); while (end - buf >= 8) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); @@ -54458,10 +55609,10 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *end = buf + len; - __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); - __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = @@ -54499,7 +55650,7 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54568,23 +55719,22 
@@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); - // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -54678,6 +55828,7 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { if (__lsx_bnz_v(forbidden_bytemask)) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + return std::make_pair(buf, reinterpret_cast(utf8_output)); } @@ -54688,10 +55839,10 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, const char32_t *start = buf; const char32_t *end = buf + len; - __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); - __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = 12; // to avoid overruns, see issue @@ -54728,7 +55879,7 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54801,14 +55952,14 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // 
[0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -54816,8 +55967,8 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m128i s3 = __lsx_vor_v(s2, v_c0e0); // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -54924,8 +56075,8 @@ lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len, const char32_t *end = buf + len; __m128i forbidden_bytemask = __lsx_vrepli_h(0); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); while (end - buf >= 8) { __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); @@ -55000,8 +56151,8 @@ lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, const char32_t *end = buf + len; __m128i forbidden_bytemask = __lsx_vrepli_h(0); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); while (end - buf >= 8) { __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); @@ -55444,9 +56595,8 @@ static inline void load_block(block64 *b, const char16_t *src) { static inline void base64_decode(char *out, __m128i str) { __m128i t0 = __lsx_vor_v( __lsx_vslli_w(str, 26), - __lsx_vslli_w(__lsx_vand_v(str, __lsx_vldi(-1758 /*0x0000FF00*/)), 12)); - __m128i t1 = - 
__lsx_vsrli_w(__lsx_vand_v(str, __lsx_vldi(-3521 /*0x003F0000*/)), 2); + __lsx_vslli_w(__lsx_vand_v(str, lsx_splat_u32(0x0000FF00)), 12)); + __m128i t1 = __lsx_vsrli_w(__lsx_vand_v(str, lsx_splat_u32(0x003F0000)), 2); __m128i t2 = __lsx_vor_v(t0, t1); __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16)); const v16u8 pack_shuffle = {3, 2, 1, 7, 6, 5, 11, 10, @@ -57511,7 +58661,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -57554,7 +58703,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -57732,6 +58880,147 @@ const result validate_utf16_with_errors(const char16_t *input, size_t size) { /* end file src/generic/validate_utf16.h */ #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace lsx { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? 
a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= 
as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. 
+ counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + // // Implementation-specific overrides // @@ -58678,39 +59967,14 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m128i v_80 = __lsx_vrepli_w(0x80); /*0x00000080*/ - const __m128i v_800 = __lsx_vldi(-3832); /*0x00000800*/ - const __m128i v_10000 = __lsx_vldi(-3583); /*0x00010000*/ - size_t pos = 0; - size_t count = 0; - for (; pos + 4 <= length; pos += 4) { - __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); - const __m128i ascii_bytes_bytemask = __lsx_vslt_w(in, v_80); - const __m128i one_two_bytes_bytemask = __lsx_vslt_w(in, v_800); - const __m128i two_bytes_bytemask = - __lsx_vxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m128i three_bytes_bytemask = - __lsx_vxor_v(__lsx_vslt_w(in, v_10000), one_two_bytes_bytemask); - - const uint32_t ascii_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(ascii_bytes_bytemask)), 0); - const uint32_t two_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(two_bytes_bytemask)), 0); - const uint32_t three_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(three_bytes_bytemask)), 0); - - count += - 16 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count; - } - return count + - scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); + return utf32::utf8_length_from_utf32(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t 
*input, size_t length) const noexcept { - const __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ + const __m128i v_ffff = lsx_splat_u32(0x0000ffff); size_t pos = 0; size_t count = 0; for (; pos + 4 <= length; pos += 4) { @@ -58835,6 +60099,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, } // namespace simdutf /* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lsx/end.h */ /* end file src/lsx/implementation.cpp */ #endif @@ -58843,6 +60108,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, /* begin file src/simdutf/lasx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lasx" // #define SIMDUTF_IMPLEMENTATION lasx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lasx/begin.h */ namespace simdutf { namespace lasx { @@ -58953,7 +60219,7 @@ convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits // 1 byte: 00000000 00000000 // 2 byte: 00000aaa aa000000 - __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + __m128i v1f00 = lsx_splat_u16(0x1f00); __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits // Combine with a shift right accumulate // 1 byte: 00000000 0bbbbbbb @@ -58983,7 +60249,6 @@ simd8 utf16_gather_high_bytes(const simd16 in0, #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING /* begin file src/lasx/lasx_validate_utf32le.cpp */ - const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; @@ -58995,9 +60260,9 @@ const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { } } - __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); - __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); - __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i 
offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); __m256i currentmax = __lasx_xvldi(0x0); __m256i currentoffsetmax = __lasx_xvldi(0x0); @@ -59040,9 +60305,9 @@ const result lasx_validate_utf32le_with_errors(const char32_t *input, input++; } - __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); - __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); - __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); __m256i currentmax = __lasx_xvldi(0x0); __m256i currentoffsetmax = __lasx_xvldi(0x0); @@ -59083,11 +60348,11 @@ lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const size_t safety_margin = 12; - const char *end = latin1_input + len - safety_margin; + const char *end = latin1_input + len; // We always write 16 bytes, of which more than the first 8 bytes // are valid. A safety margin of 8 is more than sufficient. 
- while (end - latin1_input >= 16) { + while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0); if (ascii_mask == 0xFFFF) { @@ -59105,7 +60370,7 @@ lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, // t0 = [0000|00aa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in16, 2); // t1 = [0000|00aa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x300)); // t3 = [0000|00aa|00bb|bbbb] __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f)); // t4 = [1100|00aa|10bb|bbbb] @@ -59166,7 +60431,7 @@ lasx_convert_latin1_to_utf16le(const char *buf, size_t len, buf += 32; } - if (buf + 16 <= end) { + if (end - buf >= 16) { __m128i zero = __lsx_vldi(0); __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); @@ -59191,7 +60456,7 @@ lasx_convert_latin1_to_utf16be(const char *buf, size_t len, } __m256i zero = __lasx_xvldi(0); - while (buf + 32 <= end) { + while (end - buf >= 32) { __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000); @@ -59204,7 +60469,7 @@ lasx_convert_latin1_to_utf16be(const char *buf, size_t len, buf += 32; } - if (buf + 16 <= end) { + if (end - buf >= 16) { __m128i zero_128 = __lsx_vldi(0); __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); @@ -59402,7 +60667,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // 1 byte: 00000000 00000000 // 2 byte: xx0bbbbb 00000000 // 3 byte: xxbbbbbb 00000000 - __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); // 1 byte: 00000000 0ccccccc // 2 byte: 0010bbbb bbcccccc // 3 byte: 0010bbbb bbcccccc @@ -59454,10 +60719,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // = +0xDC00|0xE7C0 __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); // Generate 
unadjusted trail surrogate minus lowest 2 bits - // vec(0000FF00) = __lsx_vldi(-1758) // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 - __m128i trail = - __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000FF00)); // Insert low 2 bits of trail surrogate to magic number for later // 11011100 00000000 11100111 110000cc __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); @@ -59470,10 +60733,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, __lsx_vrepli_h(0x3f /* 0x003f*/)); // Blend pairs - // __lsx_vldi(-1741) => vec(0x0000FFFF) // 000000cc ccdddddd|11110aaa bbbbbb00 - __m128i blend = - __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */); + __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF)); // Add magic number to finish the result // 110111CC CCDDDDDD|110110AA BBBBBBCC @@ -59510,13 +60771,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, // first. __m128i middlehigh = __lsx_vslli_w(perm, 2); // 00000000 00000000 00cccccc 00000000 - __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00)); // Start assembling the sequence. Since the 4th byte is in the same position // as it would be in a surrogate and there is no dependency, shift left // instead of right. 
3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - __m128i ab = - __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); // Top 16 bits contains the high ten bits of the surrogate pair before // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction @@ -59530,8 +60790,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // After this is for surrogates // Blend the low and high surrogates // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - __m128i mixed = - __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: // 11110aaa bbbbbbcc|000000cc ccdddddd @@ -59696,14 +60955,13 @@ size_t convert_masked_utf8_to_utf32(const char *input, // The top bits will be corrected later in the bsl // 00000000 10bbbbbb 00000000 __m128i middle = - __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits // Combine low and middle with shift right accumulate // 00000000 00xxbbbb bbcccccc __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); // Insert top 4 bits from high byte with bitwise select // 00000000 aaaabbbb bbcccccc - __m128i composed = - __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); __lsx_vst(composed, utf32_output, 0); utf32_output += 4; // We wrote 4 32-bit characters. 
return consumed; @@ -59732,10 +60990,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, __m128i merge2 = __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ - __lsx_vldi(-2545)); /*0x00000FFF*/ + lsx_splat_u32(0x00000FFF)); // Clear the garbage // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. @@ -59755,11 +61013,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, // Ascii __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); - __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); // 00000000 00000000 0000cccc ccdddddd __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); - __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000)); __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); // Insert twice // 00000000 000aaabb bbbbxxxx xxxxxxxx @@ -59769,8 +61027,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); ab = __lsx_vsrli_w(ab, 4); // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = - __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -60021,7 +61278,7 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60059,9 +61316,8 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { buf += 16; continue; } - __m256i surrogates_bytemask = - __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), - __lasx_xvldi(-2600 /*0xD800*/)); + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. if (__lasx_xbz_v(surrogates_bytemask)) { @@ -60101,14 +61357,14 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); @@ -60116,8 +61372,8 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); __m256i s3 = __lasx_xvor_v(s2, v_c0e0); __m256i one_or_two_bytes_bytemask = 
__lasx_xvsle_hu(in, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -60276,7 +61532,7 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60314,9 +61570,8 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, buf += 16; continue; } - __m256i surrogates_bytemask = - __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), - __lasx_xvldi(-2600 /*0xD800*/)); + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
if (__lasx_xbz_v(surrogates_bytemask)) { @@ -60356,14 +61611,14 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); @@ -60371,8 +61626,8 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); __m256i s3 = __lasx_xvor_v(s2, v_c0e0); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -60524,8 +61779,8 @@ lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len, } } - __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_f800 = lasx_splat_u16(0xf800); + __m256i v_d800 = lasx_splat_u16(0xd800); while (end - buf >= 16) { __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); @@ -60623,8 +61878,8 @@ lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, } } - __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_f800 = lasx_splat_u16(0xf800); + __m256i v_d800 = lasx_splat_u16(0xd800); while (end - buf >= 16) { __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); if (!match_system(big_endian)) { @@ -60795,10 +62050,10 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { buf++; } - __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); - __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); __m256i zero = __lasx_xvldi(0); __m128i zero_128 = __lsx_vldi(0); __m256i forbidden_bytemask = __lasx_xvldi(0x0); @@ -60840,7 +62095,7 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60920,14 +62175,14 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char 
*utf8_out) { __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -60936,8 +62191,8 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(utf16_packed, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -61091,10 +62346,10 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, buf++; } - __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); - __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); __m256i zero = __lasx_xvldi(0); __m128i zero_128 = __lsx_vldi(0); __m256i forbidden_bytemask = __lasx_xvldi(0x0); @@ -61135,7 +62390,7 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -61219,14 +62474,14 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -61235,8 
+62490,8 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(utf16_packed, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -61396,8 +62651,8 @@ lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len, } __m256i forbidden_bytemask = __lasx_xvrepli_h(0); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); while (end - buf >= 16) { __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); @@ -61503,8 +62758,8 @@ lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, } __m256i forbidden_bytemask = __lasx_xvrepli_h(0); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); while (end - buf >= 16) { __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); @@ -61945,10 +63200,9 @@ static inline void load_block(block64 *b, const char16_t *src) { static inline void base64_decode(char *out, __m256i str) { __m256i t0 = __lasx_xvor_v( __lasx_xvslli_w(str, 26), - __lasx_xvslli_w(__lasx_xvand_v(str, __lasx_xvldi(-1758 /*0x0000FF00*/)), - 12)); - __m256i t1 = __lasx_xvsrli_w( - __lasx_xvand_v(str, __lasx_xvldi(-3521 /*0x003F0000*/)), 2); + __lasx_xvslli_w(__lasx_xvand_v(str, lasx_splat_u32(0x0000ff00)), 12)); + __m256i t1 = + __lasx_xvsrli_w(__lasx_xvand_v(str, lasx_splat_u32(0x003f0000)), 2); __m256i t2 = __lasx_xvor_v(t0, t1); __m256i t3 = 
__lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16)); __m256i pack_shuffle = ____m256i( @@ -64028,7 +65282,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -64071,7 +65324,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -64249,6 +65501,147 @@ const result validate_utf16_with_errors(const char16_t *input, size_t size) { /* end file src/generic/validate_utf16.h */ #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace lasx { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. 
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. 
vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + // // Implementation-specific overrides // @@ -65308,45 +66701,14 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - __m256i v_80 = __lasx_xvrepli_w(0x80); /*0x00000080*/ - __m256i v_800 = __lasx_xvldi(-3832); /*0x00000800*/ - __m256i v_10000 = __lasx_xvldi(-3583); /*0x00010000*/ - size_t pos = 0; - size_t count = 0; - for (; pos + 8 <= length; pos += 8) { - __m256i in = - __lasx_xvld(reinterpret_cast(input + pos), 0); - __m256i ascii_bytes_bytemask = __lasx_xvslt_w(in, v_80); - __m256i one_two_bytes_bytemask = __lasx_xvslt_w(in, v_800); - __m256i two_bytes_bytemask = - 
__lasx_xvxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); - __m256i three_bytes_bytemask = - __lasx_xvxor_v(__lasx_xvslt_w(in, v_10000), one_two_bytes_bytemask); - - __m256i ascii_bytes = - __lasx_xvpcnt_w(__lasx_xvmskltz_w(ascii_bytes_bytemask)); - const uint32_t ascii_bytes_count = __lasx_xvpickve2gr_wu(ascii_bytes, 0) + - __lasx_xvpickve2gr_wu(ascii_bytes, 4); - __m256i two_bytes = __lasx_xvpcnt_w(__lasx_xvmskltz_w(two_bytes_bytemask)); - const uint32_t two_bytes_count = __lasx_xvpickve2gr_wu(two_bytes, 0) + - __lasx_xvpickve2gr_wu(two_bytes, 4); - __m256i three_bytes = - __lasx_xvpcnt_w(__lasx_xvmskltz_w(three_bytes_bytemask)); - const uint32_t three_bytes_count = __lasx_xvpickve2gr_wu(three_bytes, 0) + - __lasx_xvpickve2gr_wu(three_bytes, 4); - - count += - 32 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count; - } - return count + - scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); + return utf32::utf8_length_from_utf32(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t *input, size_t length) const noexcept { - __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ + __m128i v_ffff = lsx_splat_u32(0x0000ffff); size_t pos = 0; size_t count = 0; for (; pos + 4 <= length; pos += 4) { @@ -65471,6 +66833,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, } // namespace simdutf /* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lasx/end.h */ /* end file src/lasx/implementation.cpp */ #endif diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index cd92c8ea04955d..02ee47d388e129 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2025-03-17 16:12:36 -0400. Do not edit! */ +/* auto-generated on 2025-04-03 18:47:20 -0400. Do not edit! 
*/ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -652,7 +652,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "6.4.0" +#define SIMDUTF_VERSION "6.4.2" namespace simdutf { enum { @@ -667,7 +667,7 @@ enum { /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 0 + SIMDUTF_VERSION_REVISION = 2 }; } // namespace simdutf @@ -3966,7 +3966,7 @@ atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input, * characters) must be divisible by four. * * You should call this function with a buffer that is at least - * maximal_binary_length_from_utf6_base64(input, length) bytes long. If you fail + * maximal_binary_length_from_base64(input, length) bytes long. If you fail * to provide that much space, the function may cause a buffer overflow. * * Advanced users may want to taylor how the last chunk is handled. By default, @@ -5715,7 +5715,7 @@ class implementation { * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). * * You should call this function with a buffer that is at least - * maximal_binary_length_from_utf6_base64(input, length) bytes long. If you + * maximal_binary_length_from_base64(input, length) bytes long. If you * fail to provide that much space, the function may cause a buffer overflow. * * @param input the base64 string to process, in ASCII stored as