diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index 2332f2bea4eddc..8991fbe7b57877 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2025-03-17 16:12:36 -0400. Do not edit! */ +/* auto-generated on 2025-04-03 18:47:20 -0400. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" @@ -2378,6 +2378,61 @@ const uint8_t pack_1_2_3_utf8_bytes[256][17] = { #endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H /* end file src/tables/utf16_to_utf8_tables.h */ +/* begin file src/tables/utf32_to_utf16_tables.h */ +// file generated by scripts/sse_convert_utf32_to_utf16.py +#ifndef SIMDUTF_UTF32_TO_UTF16_TABLES_H +#define SIMDUTF_UTF32_TO_UTF16_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf32_to_utf16 { + +const uint8_t pack_utf32_to_utf16le[16][16] = { + {0, 1, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +}; + +const uint8_t 
pack_utf32_to_utf16be[16][16] = { + {1, 0, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, +}; + +} // namespace utf32_to_utf16 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF32_TO_UTF16_TABLES_H +/* end file src/tables/utf32_to_utf16_tables.h */ // End of tables. 
// Implementations: they need to be setup before including @@ -2685,6 +2740,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 /* end file src/simdutf/arm64/begin.h */ // Declarations @@ -2808,7 +2864,15 @@ template struct simd8; template > struct base_u8 { uint8x16_t value; static const int SIZE = sizeof(value); - + void dump() const { + uint8_t temp[16]; + vst1q_u8(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x,%04x, %04x, %04x, " + "%04x, %04x, %04x, %04x, %04x]\n", + temp[0], temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], + temp[7], temp[8], temp[9], temp[10], temp[11], temp[12], temp[13], + temp[14], temp[15]); + } // Conversion from/to SIMD register simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} simdutf_really_inline operator const uint8x16_t &() const { @@ -3054,6 +3118,7 @@ template <> struct simd8 : base_u8 { template simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + simdutf_really_inline uint16_t sum_bytes() const { return vaddvq_u8(*this); } // Perform a lookup assuming the value is between 0 and 16 (undefined behavior // for out of range values) @@ -3084,6 +3149,7 @@ template <> struct simd8 : base_u8 { // Signed bytes template <> struct simd8 { int8x16_t value; + static const int SIZE = sizeof(value); static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); @@ -3454,8 +3520,10 @@ template struct simd16; template > struct base_u16 { uint16x8_t value; + /// the size of vector in bytes static const int SIZE = sizeof(value); - + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); // Conversion from/to SIMD register simdutf_really_inline base_u16() = default; simdutf_really_inline base_u16(const uint16x8_t _value) : 
value(_value) {} @@ -3515,7 +3583,12 @@ struct base16 : base_u16 { simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {} static const int SIZE = sizeof(base_u16::value); - + void dump() const { + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); + } template simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { return vextq_u18(prev_chunk, *this, 8 - N); @@ -3558,10 +3631,10 @@ template struct base16_numeric : base16 { // Addition/subtraction are the same for signed and unsigned simdutf_really_inline simd16 operator+(const simd16 other) const { - return vaddq_u8(*this, other); + return vaddq_u16(*this, other); } simdutf_really_inline simd16 operator-(const simd16 other) const { - return vsubq_u8(*this, other); + return vsubq_u16(*this, other); } simdutf_really_inline simd16 &operator+=(const simd16 other) { *this = *this + other; @@ -3723,6 +3796,14 @@ template <> struct simd16 : base16_numeric { simdutf_really_inline simd16 swap_bytes() const { return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this))); } + void dump() const { + uint16_t temp[8]; + vst1q_u16(temp, *this); + printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0], + temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]); + } + + simdutf_really_inline uint32_t sum() const { return vaddlvq_u16(value); } }; simdutf_really_inline simd16::operator simd16() const { return this->value; @@ -3857,7 +3938,180 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( (this->chunks[3] < mask_low))); return x.to_bitmask(); } + +simdutf_really_inline simd16 min(const simd16 a, + simd16 b) { + return vminq_u16(a.value, b.value); +} /* end file src/simdutf/arm64/simd16-inl.h */ +/* begin file src/simdutf/arm64/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + static const size_t SIZE = 
sizeof(uint32x4_t); + static const size_t ELEMENTS = SIZE / sizeof(uint32_t); + + uint32x4_t value; + + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} + + template + simdutf_really_inline simd32(const Pointer *ptr) + : value(vld1q_u32(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { return vaddvq_u32(value); } + + simdutf_really_inline simd32 swap_bytes() const { + return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(value))); + } + + template simdutf_really_inline simd32 shr() const { + return vshrq_n_u32(value, N); + } + + template simdutf_really_inline simd32 shl() const { + return vshlq_n_u32(value, N); + } + + void dump() const { + uint32_t temp[4]; + vst1q_u32(temp, value); + printf("[%08x, %08x, %08x, %08x]\n", temp[0], temp[1], temp[2], temp[3]); + } + + // operators + simdutf_really_inline simd32 &operator+=(const simd32 other) { + value = vaddq_u32(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd32 zero() { + return vdupq_n_u32(0); + } + + simdutf_really_inline static simd32 splat(uint32_t v) { + return vdupq_n_u32(v); + } +}; + +//---------------------------------------------------------------------- + +template <> struct simd32 { + uint32x4_t value; + + simdutf_really_inline simd32(const uint32x4_t v) : value(v) {} + + simdutf_really_inline bool any() const { return vmaxvq_u32(value) != 0; } +}; + +//---------------------------------------------------------------------- + +template +simdutf_really_inline simd32 operator|(const simd32 a, + const simd32 b) { + return vorrq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 min(const simd32 a, + const simd32 b) { + return vminq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 max(const simd32 a, + const simd32 b) { + return vmaxq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return vceqq_u32(a.value, vdupq_n_u32(b)); +} + 
+simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return vandq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return vandq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return vorrq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator+(const simd32 a, + const simd32 b) { + return vaddq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return vsubq_u32(a.value, vdupq_n_u32(b)); +} + +simdutf_really_inline simd32 operator>=(const simd32 a, + const simd32 b) { + return vcgeq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 operator!(const simd32 v) { + return vmvnq_u32(v.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return vcgtq_u32(a.value, b.value); +} + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return vbslq_u32(cond.value, v_true.value, v_false.value); +} +/* end file src/simdutf/arm64/simd32-inl.h */ +/* begin file src/simdutf/arm64/simd64-inl.h */ +template struct simd64; + +template <> struct simd64 { + uint64x2_t value; + + simdutf_really_inline simd64(const uint64x2_t v) : value(v) {} + + template + simdutf_really_inline simd64(const Pointer *ptr) + : value(vld1q_u64(reinterpret_cast(ptr))) {} + + simdutf_really_inline uint64_t sum() const { return vaddvq_u64(value); } + + // operators + simdutf_really_inline simd64 &operator+=(const simd64 other) { + value = vaddq_u64(value, other.value); + return *this; + } + + // static members + simdutf_really_inline static simd64 zero() { + return vdupq_n_u64(0); + } + + simdutf_really_inline static simd64 splat(uint64_t v) { + return vdupq_n_u64(v); + } +}; +/* end file src/simdutf/arm64/simd64-inl.h */ + +simdutf_really_inline simd64 sum_8bytes(const simd8 v) { + // We do it as 3 
instructions. There might be a faster way. + // We hope that these 3 instructions are cheap. + uint16x8_t first_sum = vpaddlq_u8(v); + uint32x4_t second_sum = vpaddlq_u16(first_sum); + return vpaddlq_u32(second_sum); +} + } // namespace simd } // unnamed namespace } // namespace arm64 @@ -4442,6 +4696,98 @@ namespace icelake { namespace { namespace simd { +/* begin file src/simdutf/icelake/simd16-inl.h */ +template struct simd16; + +template <> struct simd16 { + static const size_t SIZE = sizeof(__m512i); + static const size_t ELEMENTS = SIZE / sizeof(uint16_t); + + template + static simdutf_really_inline simd16 load(const Pointer *ptr) { + return simd16(ptr); + } + + __m512i value; + + simdutf_really_inline simd16(const __m512i v) : value(v) {} + + template + simdutf_really_inline simd16(const Pointer *ptr) + : value(_mm512_loadu_si512(reinterpret_cast(ptr))) {} + + // operators + simdutf_really_inline simd16 &operator+=(const simd16 other) { + value = _mm512_add_epi32(value, other.value); + return *this; + } + + simdutf_really_inline simd16 &operator-=(const simd16 other) { + value = _mm512_sub_epi32(value, other.value); + return *this; + } + + // methods + simdutf_really_inline simd16 swap_bytes() const { + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + + return _mm512_shuffle_epi8(value, byteflip); + } + + simdutf_really_inline uint64_t sum() const { + const auto lo = _mm512_and_si512(value, _mm512_set1_epi32(0xffff)); + const auto hi = _mm512_srli_epi32(value, 16); + const auto sum32 = _mm512_add_epi32(lo, hi); + + return _mm512_reduce_add_epi32(sum32); + } + + // static members + simdutf_really_inline static simd16 zero() { + return _mm512_setzero_si512(); + } + + simdutf_really_inline static simd16 splat(uint16_t v) { + return _mm512_set1_epi16(v); + } +}; + +template <> struct simd16 { + 
__mmask32 value; + + simdutf_really_inline simd16(const __mmask32 v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd16 min(const simd16 b, + const simd16 a) { + return _mm512_min_epu16(a.value, b.value); +} + +simdutf_really_inline simd16 operator&(const simd16 a, + uint16_t b) { + return _mm512_and_si512(a.value, _mm512_set1_epi16(b)); +} + +simdutf_really_inline simd16 operator^(const simd16 a, + uint16_t b) { + return _mm512_xor_si512(a.value, _mm512_set1_epi16(b)); +} + +simdutf_really_inline simd16 operator^(const simd16 a, + const simd16 b) { + return _mm512_xor_si512(a.value, b.value); +} + +simdutf_really_inline simd16 operator==(const simd16 a, + uint16_t b) { + return _mm512_cmpeq_epi16_mask(a.value, _mm512_set1_epi16(b)); +} +/* end file src/simdutf/icelake/simd16-inl.h */ /* begin file src/simdutf/icelake/simd32-inl.h */ template struct simd32; @@ -4939,7 +5285,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -6436,7 +6782,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. 
@@ -6834,7 +7180,6 @@ template <> struct simd8 : base8_numeric { simdutf_really_inline uint64_t sum_bytes() const { const auto tmp = _mm_sad_epu8(value, _mm_setzero_si128()); - return _mm_extract_epi64(tmp, 0) + _mm_extract_epi64(tmp, 1); } }; @@ -7398,6 +7743,14 @@ template <> struct simd32 { return _mm_shuffle_epi8(value, shuffle); } + template simdutf_really_inline simd32 shr() const { + return _mm_srli_epi32(value, N); + } + + template simdutf_really_inline simd32 shl() const { + return _mm_slli_epi32(value, N); + } + void dump() const { printf("[%08x, %08x, %08x, %08x]\n", uint32_t(_mm_extract_epi32(value, 0)), uint32_t(_mm_extract_epi32(value, 1)), @@ -7434,6 +7787,10 @@ template <> struct simd32 { simdutf_really_inline bool any() const { return _mm_movemask_epi8(value) != 0; } + + simdutf_really_inline uint8_t to_4bit_bitmask() const { + return uint8_t(_mm_movemask_ps(_mm_castsi128_ps(value))); + } }; //---------------------------------------------------------------------- @@ -7454,16 +7811,36 @@ simdutf_really_inline simd32 max(const simd32 a, return _mm_max_epu32(a.value, b.value); } +simdutf_really_inline simd32 operator==(const simd32 a, + uint32_t b) { + return _mm_cmpeq_epi32(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator&(const simd32 a, const simd32 b) { return _mm_and_si128(a.value, b.value); } +simdutf_really_inline simd32 operator&(const simd32 a, + uint32_t b) { + return _mm_and_si128(a.value, _mm_set1_epi32(b)); +} + +simdutf_really_inline simd32 operator|(const simd32 a, + uint32_t b) { + return _mm_or_si128(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator+(const simd32 a, const simd32 b) { return _mm_add_epi32(a.value, b.value); } +simdutf_really_inline simd32 operator-(const simd32 a, + uint32_t b) { + return _mm_sub_epi32(a.value, _mm_set1_epi32(b)); +} + simdutf_really_inline simd32 operator>=(const simd32 a, const simd32 b) { return _mm_cmpeq_epi32(_mm_max_epu32(a.value, b.value), a.value); 
@@ -7477,6 +7854,12 @@ simdutf_really_inline simd32 operator>(const simd32 a, const simd32 b) { return !(b >= a); } + +simdutf_really_inline simd32 select(const simd32 cond, + const simd32 v_true, + const simd32 v_false) { + return _mm_blendv_epi8(v_false.value, v_true.value, cond.value); +} /* end file src/simdutf/westmere/simd32-inl.h */ /* begin file src/simdutf/westmere/simd64-inl.h */ template struct simd64; @@ -7518,6 +7901,10 @@ simdutf_really_inline simd64 sum_8bytes(const simd8 v) { return _mm_sad_epu8(v.value, simd8::zero()); } +simdutf_really_inline simd8 as_vector_u8(const simd32 v) { + return simd8(v.value); +} + } // namespace simd } // unnamed namespace } // namespace westmere @@ -10053,6 +10440,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/lsx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lsx" // #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lsx/begin.h */ // Declarations @@ -10065,6 +10453,190 @@ class implementation final : public simdutf::implementation { // you use visual studio or other compilers. #include +/* +Encoding of argument for LoongArch64 xvldi instruction. 
See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm + +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes + +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes + +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes + +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes + +5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes + +6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes + +7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all +lanes + +8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to +all lanes + +9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes + +10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast +the result as 64-bit elements to all lanes +*/ + +namespace vldi { + +template class const_u16 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + + constexpr static bool is_case5 = uint16_t(b0) == v; + constexpr static bool is_case6 = (uint16_t(b1) << 8) == v; + constexpr static bool is_case9 = (b0 == b1); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)); + +public: + constexpr static uint16_t operation = is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case5 ? b0 + : is_case6 ? b1 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 
0xaa : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u32 { + constexpr static const uint8_t b0 = (v & 0xff); + constexpr static const uint8_t b1 = ((v >> 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 16) & 0xff); + constexpr static const uint8_t b3 = ((v >> 24) & 0xff); + + constexpr static bool is_case1 = (uint32_t(b0) == v); + constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v); + constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v); + constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v); + constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0); + constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0); + constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff); + constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff); + constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)); + +public: + constexpr static uint16_t operation = is_case1 ? 0b10000 + : is_case2 ? 0b10001 + : is_case3 ? 0b10010 + : is_case4 ? 0b10011 + : is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case7 ? 0b10110 + : is_case8 ? 0b10111 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case1 ? b0 + : is_case2 ? b1 + : is_case3 ? b2 + : is_case4 ? b3 + : is_case5 ? b0 + : is_case6 ? b1 + : is_case7 ? b1 + : is_case8 ? b2 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) | + (b2 ? 0x44 : 0x00) | (b3 ? 
0x88 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u64 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff); + constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff); + constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff); + constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff); + constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff); + constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff); + + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) && + ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) && + ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00)); + +public: + constexpr static bool is_32bit = + ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid; + constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation; + constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte; + + constexpr static uint16_t operation = is_32bit ? op_32bit + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_32bit ? byte_32bit + : is_case10 + ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) | + (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) | + (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; +} // namespace vldi + +// Uncomment when running under QEMU affected +// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865 +// Versions <= 9.2.2 are affected, likely anything newer is correct. 
+#ifndef QEMU_VLDI_BUG +// #define QEMU_VLDI_BUG 1 +#endif + +#ifdef QEMU_VLDI_BUG + #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v) + #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v) +#else +template constexpr __m128i lsx_splat_u16_aux() { + constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512); + constexpr uint16_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = vldi::const_u16::valid; + constexpr int vldi_imm = is_vldi ? vldi::const_u16::value : 0; + + return is_imm10 ? __lsx_vrepli_h(int16_t(imm10)) + : is_vldi ? __lsx_vldi(vldi_imm) + : __lsx_vreplgr2vr_h(x); +} + +template constexpr __m128i lsx_splat_u32_aux() { + constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512); + constexpr uint32_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = vldi::const_u32::valid; + constexpr int vldi_imm = is_vldi ? vldi::const_u32::value : 0; + + return is_imm10 ? __lsx_vrepli_w(int32_t(imm10)) + : is_vldi ? __lsx_vldi(vldi_imm) + : __lsx_vreplgr2vr_w(x); +} + + #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>() + #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>() +#endif // QEMU_VLDI_BUG + #endif // SIMDUTF_LSX_INTRINSICS_H /* end file src/simdutf/lsx/intrinsics.h */ /* begin file src/simdutf/lsx/bitmanipulation.h */ @@ -10685,6 +11257,7 @@ template struct simd8x64 { .to_bitmask(); } }; // struct simd8x64 + /* begin file src/simdutf/lsx/simd16-inl.h */ template struct simd16; @@ -11065,6 +11638,78 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( return x.to_bitmask(); } /* end file src/simdutf/lsx/simd16-inl.h */ +/* begin file src/simdutf/lsx/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + __m128i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); + + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} + + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {} + + // in-place operators + simdutf_really_inline 
simd32 &operator-=(const simd32 other) { + value = __lsx_vsub_w(value, other.value); + return *this; + } + + // members + simdutf_really_inline uint64_t sum() const { + return uint64_t(__lsx_vpickve2gr_wu(value, 0)) + + uint64_t(__lsx_vpickve2gr_wu(value, 1)) + + uint64_t(__lsx_vpickve2gr_wu(value, 2)) + + uint64_t(__lsx_vpickve2gr_wu(value, 3)); + } + + // static members + static simdutf_really_inline simd32 splat(uint32_t x) { + return __lsx_vreplgr2vr_w(x); + } + + static simdutf_really_inline simd32 zero() { + return __lsx_vrepli_w(0); + } +}; + +// ------------------------------------------------------------ + +template <> struct simd32 { + __m128i value; + static const int SIZE = sizeof(value); + + // constructors + simdutf_really_inline simd32(__m128i v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return __lsx_vand_v(a.value, b.value); +} + +simdutf_really_inline simd32 operator<(const simd32 a, + const simd32 b) { + return __lsx_vslt_wu(a.value, b.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return __lsx_vslt_wu(b.value, a.value); +} + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 as_vector_u32(const simd32 v) { + return v.value; +} +/* end file src/simdutf/lsx/simd32-inl.h */ + } // namespace simd } // unnamed namespace } // namespace lsx @@ -11074,6 +11719,7 @@ simdutf_really_inline uint64_t simd16x32::not_in_range( /* end file src/simdutf/lsx/simd16-inl.h */ /* end file src/simdutf/lsx/simd.h */ /* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lsx/end.h */ #endif // SIMDUTF_IMPLEMENTATION_LSX @@ -11382,6 +12028,7 @@ class implementation final : public simdutf::implementation { /* begin file src/simdutf/lasx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lasx" // #define SIMDUTF_IMPLEMENTATION lasx +#define 
SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lasx/begin.h */ // Declarations @@ -11485,6 +12132,191 @@ static inline __m128i lasx_extracti128_hi(__m256i in) { } #endif +/* +Encoding of argument for LoongArch64 xvldi instruction. See: +https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm + +1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes + +2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes + +3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes + +4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes + +5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes + +6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes + +7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all +lanes + +8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to +all lanes + +9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes + +10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast +the result as 64-bit elements to all lanes +*/ + +namespace lasx_vldi { + +template class const_u16 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + + constexpr static bool is_case5 = uint16_t(b0) == v; + constexpr static bool is_case6 = (uint16_t(b1) << 8) == v; + constexpr static bool is_case9 = (b0 == b1); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)); + +public: + constexpr static uint16_t operation = is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case5 ? b0 + : is_case6 ? b1 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 
0xaa : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u32 { + constexpr static const uint8_t b0 = (v & 0xff); + constexpr static const uint8_t b1 = ((v >> 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 16) & 0xff); + constexpr static const uint8_t b3 = ((v >> 24) & 0xff); + + constexpr static bool is_case1 = (uint32_t(b0) == v); + constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v); + constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v); + constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v); + constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0); + constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0); + constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff); + constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff); + constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3); + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)); + +public: + constexpr static uint16_t operation = is_case1 ? 0b10000 + : is_case2 ? 0b10001 + : is_case3 ? 0b10010 + : is_case4 ? 0b10011 + : is_case5 ? 0b10100 + : is_case6 ? 0b10101 + : is_case7 ? 0b10110 + : is_case8 ? 0b10111 + : is_case9 ? 0b11000 + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_case1 ? b0 + : is_case2 ? b1 + : is_case3 ? b2 + : is_case4 ? b3 + : is_case5 ? b0 + : is_case6 ? b1 + : is_case7 ? b1 + : is_case8 ? b2 + : is_case9 ? b0 + : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) | + (b2 ? 0x44 : 0x00) | (b3 ? 
0x88 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +template class const_u64 { + constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff); + constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff); + constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff); + constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff); + constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff); + constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff); + constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff); + constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff); + + constexpr static bool is_case10 = + ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) && + ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) && + ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) && + ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00)); + +public: + constexpr static bool is_32bit = + ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::valid; + constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation; + constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte; + + constexpr static uint16_t operation = is_32bit ? op_32bit + : is_case10 ? 0b11001 + : 0xffff; + + constexpr static uint16_t byte = + is_32bit ? byte_32bit + : is_case10 + ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) | + (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) | + (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00)) + : 0xffff; + + constexpr static int value = int((operation << 8) | byte) - 8192; + constexpr static bool valid = operation != 0xffff; +}; + +} // namespace lasx_vldi + +// Uncomment when running under QEMU affected +// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865 +// Versions <= 9.2.2 are affected, likely anything newer is correct. 
+#ifndef QEMU_VLDI_BUG +// #define QEMU_VLDI_BUG 1 +#endif + +#ifdef QEMU_VLDI_BUG + #define lasx_splat_u16(v) __lasx_xvreplgr2vr_h(v) + #define lasx_splat_u32(v) __lasx_xvreplgr2vr_w(v) +#else +template constexpr __m256i lasx_splat_u16_aux() { + constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512); + constexpr uint16_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = lasx_vldi::const_u16::valid; + constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u16::value : 0; + + return is_imm10 ? __lasx_xvrepli_h(int16_t(imm10)) + : is_vldi ? __lasx_xvldi(vldi_imm) + : __lasx_xvreplgr2vr_h(x); +} + +template constexpr __m256i lasx_splat_u32_aux() { + constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512); + constexpr uint32_t imm10 = is_imm10 ? x : 0; + constexpr bool is_vldi = lasx_vldi::const_u32::valid; + constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u32::value : 0; + + return is_imm10 ? __lasx_xvrepli_w(int32_t(imm10)) + : is_vldi ? __lasx_xvldi(vldi_imm) + : __lasx_xvreplgr2vr_w(x); +} + + #define lasx_splat_u16(v) lasx_splat_u16_aux<(v)>() + #define lasx_splat_u32(v) lasx_splat_u32_aux<(v)>() +#endif // QEMU_VLDI_BUG + #endif // SIMDUTF_LASX_INTRINSICS_H /* end file src/simdutf/lasx/intrinsics.h */ /* begin file src/simdutf/lasx/bitmanipulation.h */ @@ -12569,6 +13401,83 @@ template struct simd16x32 { } }; // struct simd16x32 /* end file src/simdutf/lasx/simd16-inl.h */ +/* begin file src/simdutf/lasx/simd32-inl.h */ +template struct simd32; + +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); + static const int ELEMENTS = SIZE / sizeof(uint32_t); + + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} + + template + simdutf_really_inline simd32(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {} + + // in-place operators + simdutf_really_inline simd32 &operator-=(const simd32 other) { + value = __lasx_xvsub_w(value, other.value); + return *this; + } + + // members + 
simdutf_really_inline uint64_t sum() const { + const auto odd = __lasx_xvsrli_d(value, 32); + const auto even = __lasx_xvand_v(value, __lasx_xvreplgr2vr_d(0xffffffff)); + + const auto sum64 = __lasx_xvadd_d(odd, even); + + return uint64_t(__lasx_xvpickve2gr_du(sum64, 0)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 1)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 2)) + + uint64_t(__lasx_xvpickve2gr_du(sum64, 3)); + } + + // static members + static simdutf_really_inline simd32 splat(uint32_t x) { + return __lasx_xvreplgr2vr_w(x); + } + + static simdutf_really_inline simd32 zero() { + return __lasx_xvrepli_w(0); + } +}; + +// ------------------------------------------------------------ + +template <> struct simd32 { + __m256i value; + static const int SIZE = sizeof(value); + + // constructors + simdutf_really_inline simd32(__m256i v) : value(v) {} +}; + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 operator&(const simd32 a, + const simd32 b) { + return __lasx_xvor_v(a.value, b.value); +} + +simdutf_really_inline simd32 operator<(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(a.value, b.value); +} + +simdutf_really_inline simd32 operator>(const simd32 a, + const simd32 b) { + return __lasx_xvslt_wu(b.value, a.value); +} + +// ------------------------------------------------------------ + +simdutf_really_inline simd32 as_vector_u32(const simd32 v) { + return v.value; +} +/* end file src/simdutf/lasx/simd32-inl.h */ + } // namespace simd } // unnamed namespace } // namespace lasx @@ -12578,6 +13487,7 @@ template struct simd16x32 { /* end file src/simdutf/lasx/simd.h */ /* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lasx/end.h */ #endif // SIMDUTF_IMPLEMENTATION_LASX @@ -18563,6 +19473,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS /* begin file src/simdutf/arm64/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "arm64" // #define SIMDUTF_IMPLEMENTATION arm64 
+#define SIMDUTF_SIMD_HAS_BYTEMASK 1 /* end file src/simdutf/arm64/begin.h */ namespace simdutf { namespace arm64 { @@ -20439,6 +21350,64 @@ arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast(utf8_output)); } + +template +simdutf_really_inline size_t +arm64_utf8_length_from_utf16_bytemask(const char16_t *in, size_t size) { + size_t pos = 0; + + constexpr size_t N = 8; + const auto one = vmovq_n_u16(1); + // each char16 yields at least one byte + size_t count = size / N * N; + + for (; pos < size / N * N; pos += N) { + auto input = vld1q_u16(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input))); + } + // 0xd800 .. 0xdbff - low surrogate + // 0xdc00 .. 0xdfff - high surrogate + const auto is_surrogate = + vceqq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), vmovq_n_u16(0xd800)); + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xff80)), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = vminq_u16(vandq_u16(input, vmovq_n_u16(0xf800)), one); + + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. 
+ + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. + */ + auto v_count = vaddq_u16(c1, c0); + v_count = vaddq_u16(v_count, is_surrogate); + count += vaddlvq_u16(v_count); + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ #endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF8 @@ -20519,6 +21488,26 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, vst4q_u8(out, result); out += 64; } + + if (i + 24 <= srclen) { + const uint8x8_t v3f_d = vdup_n_u8(0x3f); + const uint8x8x3_t in = vld3_u8((const uint8_t *)src + i); + uint8x8x4_t result; + result.val[0] = vshr_n_u8(in.val[0], 2); + result.val[1] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[1], 4), in.val[0], 4), v3f_d); + result.val[2] = + vand_u8(vsli_n_u8(vshr_n_u8(in.val[2], 6), in.val[1], 2), v3f_d); + result.val[3] = vand_u8(in.val[2], v3f_d); + result.val[0] = vqtbl4_u8(table, result.val[0]); + result.val[1] = vqtbl4_u8(table, result.val[1]); + result.val[2] = vqtbl4_u8(table, result.val[2]); + result.val[3] = vqtbl4_u8(table, result.val[3]); + vst4_u8(out, result); + out += 32; + i += 24; + } + out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options); @@ -20763,11 +21752,9 @@ void load_block(block64 *b, const char16_t *src) { void base64_decode_block(char *out, const char *src) { uint8x16x4_t str = vld4q_u8((uint8_t *)src); uint8x16x3_t outvec; - outvec.val[0] = - vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); - outvec.val[1] = - vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); - outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + outvec.val[0] = vsliq_n_u8(vshrq_n_u8(str.val[1], 4), str.val[0], 2); + outvec.val[1] = 
vsliq_n_u8(vshrq_n_u8(str.val[2], 2), str.val[1], 4); + outvec.val[2] = vsliq_n_u8(str.val[3], str.val[2], 6); vst3q_u8((uint8_t *)out, outvec); } @@ -23109,7 +24096,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -23152,7 +24138,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -24597,12 +25582,13 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); + return arm64_utf8_length_from_utf16_bytemask(input, + length); } simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); + return arm64_utf8_length_from_utf16_bytemask(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 @@ -27604,6 +28590,92 @@ simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, } } /* end file src/icelake/icelake_convert_utf8_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */ +// This is translation of `utf8_length_from_utf16_bytemask` from +// `generic/utf16.h` +template +simdutf_really_inline size_t icelake_utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + + using vector_u16 = simd16; + constexpr size_t N = vector_u16::ELEMENTS; + + const auto one = vector_u16::splat(1); + + auto v_count = vector_u16::zero(); + + // each 
char16 yields at least one byte + size_t count = size / N * N; + + // in a single iteration the increment is 0, 1 or 2, despite we have + // three additions + constexpr size_t max_iterations = 65535 / 2; + size_t iteration = max_iterations; + + for (; pos < size / N * N; pos += N) { + auto input = vector_u16::load(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input = input.swap_bytes(); + } + + // not_surrogate[i] = non-zero if i-th element is not a surrogate word + const auto not_surrogate = (input & uint16_t(0xf800)) ^ uint16_t(0xd800); + + // not_surrogate[i] = 1 if surrogate word, 0 otherwise + const auto is_surrogate = min(not_surrogate, one) ^ one; + + // c0 - chars that yield 2- or 3-byte UTF-8 codes + const auto c0 = min(input & uint16_t(0xff80), one); + + // c1 - chars that yield 3-byte UTF-8 codes (including surrogates) + const auto c1 = min(input & uint16_t(0xf800), one); + + /* + Explanation how the counting works. + + In the case of a non-surrogate character we count: + * always 1 -- see how `count` is initialized above; + * c0 = 1 if the current char yields 2 or 3 bytes; + * c1 = 1 if the current char yields 3 bytes. + + Thus, we always have correct count for the current char: + from 1, 2 or 3 bytes. + + A trickier part is how we count surrogate pairs. Whether + we encounter a surrogate (low or high), we count it as + 3 chars and then minus 1 (`is_surrogate` is -1 or 0). + Each surrogate char yields 2. A surrogate pair, that + is a low surrogate followed by a high one, yields + the expected 4 bytes. + + It also correctly handles cases when low surrogate is + processed by the this loop, but high surrogate is counted + by the scalar procedure. The scalar procedure uses exactly + the described approach, thanks to that for valid UTF-16 + strings it always count correctly. 
+ */ + v_count += c0; + v_count += c1; + v_count -= is_surrogate; + + iteration -= 1; + if (iteration == 0) { + count += v_count.sum(); + v_count = vector_u16::zero(); + + iteration = max_iterations; + } + } + + if (iteration > 0) { + count += v_count.sum(); + } + + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} +/* end file src/icelake/icelake_utf8_length_from_utf16.inl.cpp */ #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 @@ -29449,7 +30521,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -29460,16 +30533,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. 
vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -29485,6 +30564,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ -29499,6 +30594,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -29521,9 +30617,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -30840,83 +31942,12 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 
simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; - } - } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + return icelake_utf8_length_from_utf16(input, length); } simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = 
_mm512_set1_epi16((uint16_t)0xd800); - - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; - } - } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + return icelake_utf8_length_from_utf16(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16 @@ -31195,7 +32226,7 @@ SIMDUTF_POP_DISABLE_WARNINGS /* begin file src/simdutf/haswell/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "haswell" // #define SIMDUTF_IMPLEMENTATION haswell -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL // nothing needed. @@ -35354,7 +36385,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -35365,16 +36397,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -35390,6 +36428,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-35404,6 +36458,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -35426,9 +36481,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -35627,7 +36688,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -35670,7 +36730,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -42236,7 +43295,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -42279,7 +43337,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -42465,7 +43522,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -42476,16 +43534,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -42501,6 +43565,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-42515,6 +43595,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -42537,9 +43618,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -44458,7 +45545,7 @@ simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, /* validate first three bytes */ { size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) ++idx; if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) return 0; @@ -44525,7 +45612,7 @@ simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, } /* we need to validate the last character */ - while (tail < len && (src[0] >> 6) == 0b10) + while (tail < len && (uint8_t(src[0]) >> 6) == 0b10) --src, ++tail; return src - beg; } @@ -45459,7 +46546,7 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, /* validate first three bytes */ if (validate) { size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) + while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10) ++idx; if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) return 0; @@ -45694,9 +46781,9 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, /* validate the last character and reparse it + tail */ if (len > tail) { - if ((src[0] >> 6) == 0b10) + if ((uint8_t(src[0]) >> 6) == 0b10) --dst; - while ((src[0] >> 6) == 0b10 && tail < len) + while 
((uint8_t(src[0]) >> 6) == 0b10 && tail < len) --src, ++tail; if (is16) { /* go back one more, when on high surrogate */ @@ -46156,7 +47243,7 @@ SIMDUTF_UNTARGET_REGION /* begin file src/simdutf/westmere/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "westmere" // #define SIMDUTF_IMPLEMENTATION westmere -#define SIMDUTF_SIMD_HAS_BYTEMASK +#define SIMDUTF_SIMD_HAS_BYTEMASK 1 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. @@ -48323,6 +49410,60 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 /* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +struct expansion_result_t { + size_t u16count; + __m128i compressed; +}; + +// Function sse_expand_surrogate takes four **valid** UTF-32 characters +// having at least one code-point producing a surrogate pair. +template +expansion_result_t sse_expand_surrogate(const __m128i x) { + using vector_u32 = simd32; + using vector_u8 = simd8; + + const auto in = vector_u32(x); + + const auto non_surrogate_mask = (in & uint32_t(0xffff0000)) == uint32_t(0); + const auto mask = (~non_surrogate_mask.to_4bit_bitmask()) & 0xf; + + const auto t0 = in - uint32_t(0x00010000); + const auto hi = t0.shr<10>() & uint32_t(0x000003ff); + const auto lo = t0.shl<16>() & uint32_t(0x03ff0000); + const auto surrogates = (lo | hi) | uint32_t(0xdc00d800); + + const auto merged = as_vector_u8(select(non_surrogate_mask, in, surrogates)); + + const auto shuffle = vector_u8::load( + (byte_order == endianness::LITTLE) + ? tables::utf32_to_utf16::pack_utf32_to_utf16le[mask] + : tables::utf32_to_utf16::pack_utf32_to_utf16be[mask]); + + const size_t u16count = (4 + count_ones(mask)); + const auto compressed = shuffle.lookup_16(merged); + + return {u16count, compressed}; +} + +// Function `validate_utf32` checks 2 x 4 UTF-32 characters for their validity. 
+simdutf_really_inline bool validate_utf32(const __m128i a, const __m128i b) { + using vector_u32 = simd32; + + const auto in0 = vector_u32(a); + const auto in1 = vector_u32(b); + + const auto standardmax = vector_u32::splat(0x10ffff); + const auto offset = vector_u32::splat(0xffff2000); + const auto standardoffsetmax = vector_u32::splat(0xfffff7ff); + + const auto too_large = max(in0, in1) > standardmax; + const auto surrogate0 = (in0 + offset) > standardoffsetmax; + const auto surrogate1 = (in1 + offset) > standardoffsetmax; + + const auto combined = too_large | surrogate0 | surrogate1; + return !combined.any(); +} + template std::pair sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, @@ -48333,66 +49474,61 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); __m128i forbidden_bytemask = _mm_setzero_si128(); - while (end - buf >= 8) { - const __m128i in = _mm_loadu_si128((__m128i *)buf); - const __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + while (end - buf >= 16 + 8) { + const __m128i *ptr = reinterpret_cast(buf); + const __m128i in0 = _mm_loadu_si128(ptr + 0); + const __m128i in1 = _mm_loadu_si128(ptr + 1); + const __m128i in2 = _mm_loadu_si128(ptr + 2); + const __m128i in3 = _mm_loadu_si128(ptr + 3); - const __m128i combined = _mm_or_si128(in, nextin); + const __m128i combined = + _mm_or_si128(_mm_or_si128(in2, in3), _mm_or_si128(in0, in1)); if (simdutf_likely(_mm_testz_si128(combined, v_ffff0000))) { // No bits set above 16th, directly pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); + __m128i utf16_packed0 = _mm_packus_epi32(in0, in1); + __m128i utf16_packed1 = _mm_packus_epi32(in2, in3); const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); forbidden_bytemask = _mm_or_si128( forbidden_bytemask, - _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + _mm_or_si128( + 
_mm_cmpeq_epi16(_mm_and_si128(utf16_packed0, v_f800), v_d800), + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed1, v_f800), v_d800))); if (big_endian) { const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + utf16_packed0 = _mm_shuffle_epi8(utf16_packed0, swap); + utf16_packed1 = _mm_shuffle_epi8(utf16_packed1, swap); } - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; + _mm_storeu_si128((__m128i *)utf16_output + 0, utf16_packed0); + _mm_storeu_si128((__m128i *)utf16_output + 1, utf16_packed1); + utf16_output += 16; + buf += 16; } else { - size_t forward = 7; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf16_output); - } - *utf16_output++ = - big_endian - ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf16_output); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } + if (!validate_utf32(in0, in1) || !validate_utf32(in2, in3)) { + return std::make_pair(nullptr, utf16_output); } - buf += k; + + const auto ret0 = sse_expand_surrogate(in0); + _mm_storeu_si128((__m128i *)utf16_output, ret0.compressed); + utf16_output += ret0.u16count; + + const auto ret1 = sse_expand_surrogate(in1); + _mm_storeu_si128((__m128i *)utf16_output, ret1.compressed); + utf16_output += ret1.u16count; + + const auto ret2 = sse_expand_surrogate(in2); + _mm_storeu_si128((__m128i *)utf16_output, ret2.compressed); + utf16_output += ret2.u16count; + + const auto ret3 = sse_expand_surrogate(in3); + _mm_storeu_si128((__m128i *)utf16_output, ret3.compressed); + utf16_output += ret3.u16count; + + buf += 16; } } @@ -50253,7 +51389,8 @@ namespace utf32 { template T min(T a, T b) { return a <= b ? 
a : b; } -size_t utf8_length_from_utf32(const char32_t *input, size_t length) { +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { using vector_u32 = simd32; const char32_t *start = input; @@ -50264,16 +51401,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const size_t N = vector_u32::ELEMENTS; - const auto one = vector_u32::splat(1); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else const auto v_ffffff80 = vector_u32::splat(0xffffff80); const auto v_fffff800 = vector_u32::splat(0xfffff800); const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP size_t counter = 0; // 1. vectorized loop unrolled 4 times { - // we use uint32 counters, this is + // we use vector of uint32 counters, this is why this limit is used const size_t max_iterations = std::numeric_limits::max() / (max_increment * 4); size_t blocks = length / (N * 4); @@ -50289,6 +51432,22 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { const auto in2 = vector_u32(input + 2 * N); const auto in3 = vector_u32(input + 3 * N); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else acc += min(one, in0 & v_ffffff80); acc += min(one, in1 & v_ffffff80); acc += min(one, in2 & v_ffffff80); @@ 
-50303,6 +51462,7 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { acc += min(one, in1 & v_ffff0000); acc += min(one, in2 & v_ffff0000); acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += 4 * N; } @@ -50325,9 +51485,15 @@ size_t utf8_length_from_utf32(const char32_t *input, size_t length) { for (size_t i = 0; i < iterations; i++) { const auto in = vector_u32(input); +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else acc += min(one, in & v_ffffff80); acc += min(one, in & v_fffff800); acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP input += N; } @@ -50524,7 +51690,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -50567,7 +51732,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -52729,6 +53893,7 @@ SIMDUTF_UNTARGET_REGION /* begin file src/simdutf/lsx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lsx" // #define SIMDUTF_IMPLEMENTATION lsx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lsx/begin.h */ namespace simdutf { namespace lsx { @@ -52764,10 +53929,7 @@ const uint8_t lsx_1_2_utf8_bytes_mask[] = { #if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { - // const v16u8 shuf = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - // return __lsx_vshuf_b(__lsx_vldi(0), vec, shuf); return __lsx_vshuf4i_b(vec, 0b10110001); - // return __lsx_vor_v(__lsx_vslli_h(vec, 8), __lsx_vsrli_h(vec, 8)); } 
#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32 @@ -52839,7 +54001,7 @@ convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits // 1 byte: 00000000 00000000 // 2 byte: 00000aaa aa000000 - const __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + const __m128i v1f00 = lsx_splat_u16(0x1f00); __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits // Combine with a shift right accumulate // 1 byte: 00000000 0bbbbbbb @@ -52869,15 +54031,14 @@ simd8 utf16_gather_high_bytes(const simd16 in0, #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING /* begin file src/lsx/lsx_validate_utf32le.cpp */ - const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; - __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); - __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); - __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ - __m128i currentmax = __lsx_vldi(0x0); - __m128i currentoffsetmax = __lsx_vldi(0x0); + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); while (input + 4 < end) { __m128i in = __lsx_vld(reinterpret_cast(input), 0); @@ -52909,11 +54070,11 @@ const result lsx_validate_utf32le_with_errors(const char32_t *input, const char32_t *start = input; const char32_t *end = input + size; - __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); - __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); - __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ - __m128i currentmax = __lsx_vldi(0x0); - __m128i currentoffsetmax = __lsx_vldi(0x0); + __m128i offset = lsx_splat_u32(0xffff2000); + __m128i 
standardoffsetmax = lsx_splat_u32(0xfffff7ff); + __m128i standardmax = lsx_splat_u32(0x10ffff); + __m128i currentmax = lsx_splat_u32(0); + __m128i currentoffsetmax = lsx_splat_u32(0); while (input + 4 < end) { __m128i in = __lsx_vld(reinterpret_cast(input), 0); @@ -52975,7 +54136,7 @@ lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len, // t0 = [0000|00aa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in16, 2); // t1 = [0000|00aa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x300)); // t3 = [0000|00aa|00bb|bbbb] __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f)); // t4 = [1100|00aa|10bb|bbbb] @@ -53191,7 +54352,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // 1 byte: 00000000 00000000 // 2 byte: xx0bbbbb 00000000 // 3 byte: xxbbbbbb 00000000 - __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); // 1 byte: 00000000 0ccccccc // 2 byte: 0010bbbb bbcccccc // 3 byte: 0010bbbb bbcccccc @@ -53243,10 +54404,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // = +0xDC00|0xE7C0 __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); // Generate unadjusted trail surrogate minus lowest 2 bits - // vec(0000FF00) = __lsx_vldi(-1758) // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 - __m128i trail = - __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000ff00)); // Insert low 2 bits of trail surrogate to magic number for later // 11011100 00000000 11100111 110000cc __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); @@ -53259,10 +54418,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, __lsx_vrepli_h(0x3f /* 0x003f*/)); // Blend pairs - // __lsx_vldi(-1741) => vec(0x0000FFFF) // 000000cc ccdddddd|11110aaa bbbbbb00 - __m128i blend = - __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* 
(0x0000FFFF)*4 */); + __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF)); // Add magic number to finish the result // 110111CC CCDDDDDD|110110AA BBBBBBCC @@ -53301,13 +54458,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, // first. __m128i middlehigh = __lsx_vslli_w(perm, 2); // 00000000 00000000 00cccccc 00000000 - __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00)); // Start assembling the sequence. Since the 4th byte is in the same position // as it would be in a surrogate and there is no dependency, shift left // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - __m128i ab = - __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); // Top 16 bits contains the high ten bits of the surrogate pair before // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction @@ -53321,8 +54477,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // After this is for surrogates // Blend the low and high surrogates // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - __m128i mixed = - __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits // yet as 0x10000 was not subtracted from the codepoint yet. 
4 byte: // 11110aaa bbbbbbcc|000000cc ccdddddd @@ -53476,14 +54631,13 @@ size_t convert_masked_utf8_to_utf32(const char *input, // The top bits will be corrected later in the bsl // 00000000 10bbbbbb 00000000 __m128i middle = - __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits // Combine low and middle with shift right accumulate // 00000000 00xxbbbb bbcccccc __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); // Insert top 4 bits from high byte with bitwise select // 00000000 aaaabbbb bbcccccc - __m128i composed = - __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); __lsx_vst(composed, utf32_output, 0); utf32_output += 4; // We wrote 4 32-bit characters. return consumed; @@ -53512,10 +54666,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, __m128i merge2 = __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ - __lsx_vldi(-2545)); /*0x00000FFF*/ + lsx_splat_u32(0x00000FFF)); // Clear the garbage // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -53535,11 +54689,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, // Ascii __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); - __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); // 00000000 00000000 0000cccc ccdddddd __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); - __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000)); __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); // Insert twice // 00000000 000aaabb bbbbxxxx xxxxxxxx @@ -53549,8 +54703,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); ab = __lsx_vsrli_w(ab, 4); // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = - __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -53820,7 +54973,7 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -53846,9 +54999,8 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { utf8_output += row[0]; continue; } - __m128i surrogates_bytemask = - __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), - __lsx_vldi(-2600 /*0xD800*/)); + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. if (__lsx_bz_v(surrogates_bytemask)) { @@ -53888,14 +55040,14 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); @@ -53903,8 +55055,8 @@ lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); - __m128i m0 = 
__lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -54059,7 +55211,7 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] __m128i t0 = __lsx_vslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54085,9 +55237,8 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, utf8_output += row[0]; continue; } - __m128i surrogates_bytemask = - __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), - __lsx_vldi(-2600 /*0xD800*/)); + __m128i surrogates_bytemask = __lsx_vseq_h( + __lsx_vand_v(in, lsx_splat_u16(0xf800)), lsx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
if (__lsx_bz_v(surrogates_bytemask)) { @@ -54127,14 +55278,14 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); @@ -54142,8 +55293,8 @@ lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -54249,8 +55400,8 @@ lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len, const char16_t *end = buf + len; __m128i zero = __lsx_vldi(0); - __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); while (end - buf >= 8) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); @@ -54322,8 +55473,8 @@ lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, const char16_t *end = buf + len; __m128i zero = __lsx_vldi(0); - __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_f800 = lsx_splat_u16(0xf800); + __m128i v_d800 = lsx_splat_u16(0xd800); while (end - buf >= 8) { __m128i in = __lsx_vld(reinterpret_cast(buf), 0); @@ -54458,10 +55609,10 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *end = buf + len; - __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); - __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = @@ -54499,7 +55650,7 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54568,23 +55719,22 
@@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); __m128i s3 = __lsx_vor_v(s2, v_c0e0); - // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -54678,6 +55828,7 @@ lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { if (__lsx_bnz_v(forbidden_bytemask)) { return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + return std::make_pair(buf, reinterpret_cast(utf8_output)); } @@ -54688,10 +55839,10 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, const char32_t *start = buf; const char32_t *end = buf + len; - __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); - __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_c080 = lsx_splat_u16(0xc080); + __m128i v_07ff = lsx_splat_u16(0x07ff); + __m128i v_dfff = lsx_splat_u16(0xdfff); + __m128i v_d800 = lsx_splat_u16(0xd800); __m128i forbidden_bytemask = __lsx_vldi(0x0); const size_t safety_margin = 12; // to avoid overruns, see issue @@ -54728,7 +55879,7 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -54801,14 +55952,14 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); __m128i t1 = __lsx_vand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); + __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m128i s1 = __lsx_vslli_h(utf16_packed, 2); // 
[0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m128i s2 = __lsx_vor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -54816,8 +55967,8 @@ lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m128i s3 = __lsx_vor_v(s2, v_c0e0); // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); - __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, - __lsx_vldi(-2752 /*0x4000*/)); + __m128i m0 = + __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000)); __m128i s4 = __lsx_vxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -54924,8 +56075,8 @@ lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len, const char32_t *end = buf + len; __m128i forbidden_bytemask = __lsx_vrepli_h(0); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); while (end - buf >= 8) { __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); @@ -55000,8 +56151,8 @@ lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, const char32_t *end = buf + len; __m128i forbidden_bytemask = __lsx_vrepli_h(0); - __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ - __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); + __m128i v_d800 = lsx_splat_u16(0xd800); + __m128i v_dfff = lsx_splat_u16(0xdfff); while (end - buf >= 8) { __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); @@ -55444,9 +56595,8 @@ static inline void load_block(block64 *b, const char16_t *src) { static inline void base64_decode(char *out, __m128i str) { __m128i t0 = __lsx_vor_v( __lsx_vslli_w(str, 26), - __lsx_vslli_w(__lsx_vand_v(str, __lsx_vldi(-1758 /*0x0000FF00*/)), 12)); - __m128i t1 = - 
__lsx_vsrli_w(__lsx_vand_v(str, __lsx_vldi(-3521 /*0x003F0000*/)), 2); + __lsx_vslli_w(__lsx_vand_v(str, lsx_splat_u32(0x0000FF00)), 12)); + __m128i t1 = __lsx_vsrli_w(__lsx_vand_v(str, lsx_splat_u32(0x003F0000)), 2); __m128i t2 = __lsx_vor_v(t0, t1); __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16)); const v16u8 pack_shuffle = {3, 2, 1, 7, 6, 5, 11, 10, @@ -57511,7 +58661,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -57554,7 +58703,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -57732,6 +58880,147 @@ const result validate_utf16_with_errors(const char16_t *input, size_t size) { /* end file src/generic/validate_utf16.h */ #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace lsx { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? 
a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= 
as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. 
+ counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + // // Implementation-specific overrides // @@ -58678,39 +59967,14 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m128i v_80 = __lsx_vrepli_w(0x80); /*0x00000080*/ - const __m128i v_800 = __lsx_vldi(-3832); /*0x00000800*/ - const __m128i v_10000 = __lsx_vldi(-3583); /*0x00010000*/ - size_t pos = 0; - size_t count = 0; - for (; pos + 4 <= length; pos += 4) { - __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); - const __m128i ascii_bytes_bytemask = __lsx_vslt_w(in, v_80); - const __m128i one_two_bytes_bytemask = __lsx_vslt_w(in, v_800); - const __m128i two_bytes_bytemask = - __lsx_vxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m128i three_bytes_bytemask = - __lsx_vxor_v(__lsx_vslt_w(in, v_10000), one_two_bytes_bytemask); - - const uint32_t ascii_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(ascii_bytes_bytemask)), 0); - const uint32_t two_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(two_bytes_bytemask)), 0); - const uint32_t three_bytes_count = __lsx_vpickve2gr_bu( - __lsx_vpcnt_b(__lsx_vmskltz_w(three_bytes_bytemask)), 0); - - count += - 16 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count; - } - return count + - scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); + return utf32::utf8_length_from_utf32(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t 
*input, size_t length) const noexcept { - const __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ + const __m128i v_ffff = lsx_splat_u32(0x0000ffff); size_t pos = 0; size_t count = 0; for (; pos + 4 <= length; pos += 4) { @@ -58835,6 +60099,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, } // namespace simdutf /* begin file src/simdutf/lsx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lsx/end.h */ /* end file src/lsx/implementation.cpp */ #endif @@ -58843,6 +60108,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, /* begin file src/simdutf/lasx/begin.h */ // redefining SIMDUTF_IMPLEMENTATION to "lasx" // #define SIMDUTF_IMPLEMENTATION lasx +#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1 /* end file src/simdutf/lasx/begin.h */ namespace simdutf { namespace lasx { @@ -58953,7 +60219,7 @@ convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits // 1 byte: 00000000 00000000 // 2 byte: 00000aaa aa000000 - __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + __m128i v1f00 = lsx_splat_u16(0x1f00); __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits // Combine with a shift right accumulate // 1 byte: 00000000 0bbbbbbb @@ -58983,7 +60249,6 @@ simd8 utf16_gather_high_bytes(const simd16 in0, #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING #if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING /* begin file src/lasx/lasx_validate_utf32le.cpp */ - const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; @@ -58995,9 +60260,9 @@ const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { } } - __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); - __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); - __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i 
offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); __m256i currentmax = __lasx_xvldi(0x0); __m256i currentoffsetmax = __lasx_xvldi(0x0); @@ -59040,9 +60305,9 @@ const result lasx_validate_utf32le_with_errors(const char32_t *input, input++; } - __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); - __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); - __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i offset = lasx_splat_u32(0xffff2000); + __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff); + __m256i standardmax = lasx_splat_u32(0x10ffff); __m256i currentmax = __lasx_xvldi(0x0); __m256i currentoffsetmax = __lasx_xvldi(0x0); @@ -59083,11 +60348,11 @@ lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, char *utf8_out) { uint8_t *utf8_output = reinterpret_cast(utf8_out); const size_t safety_margin = 12; - const char *end = latin1_input + len - safety_margin; + const char *end = latin1_input + len; // We always write 16 bytes, of which more than the first 8 bytes // are valid. A safety margin of 8 is more than sufficient. 
- while (end - latin1_input >= 16) { + while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0); if (ascii_mask == 0xFFFF) { @@ -59105,7 +60370,7 @@ lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, // t0 = [0000|00aa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in16, 2); // t1 = [0000|00aa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x300)); // t3 = [0000|00aa|00bb|bbbb] __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f)); // t4 = [1100|00aa|10bb|bbbb] @@ -59166,7 +60431,7 @@ lasx_convert_latin1_to_utf16le(const char *buf, size_t len, buf += 32; } - if (buf + 16 <= end) { + if (end - buf >= 16) { __m128i zero = __lsx_vldi(0); __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); @@ -59191,7 +60456,7 @@ lasx_convert_latin1_to_utf16be(const char *buf, size_t len, } __m256i zero = __lasx_xvldi(0); - while (buf + 32 <= end) { + while (end - buf >= 32) { __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000); @@ -59204,7 +60469,7 @@ lasx_convert_latin1_to_utf16be(const char *buf, size_t len, buf += 32; } - if (buf + 16 <= end) { + if (end - buf >= 16) { __m128i zero_128 = __lsx_vldi(0); __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); @@ -59402,7 +60667,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // 1 byte: 00000000 00000000 // 2 byte: xx0bbbbb 00000000 // 3 byte: xxbbbbbb 00000000 - __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + __m128i middlebyte = __lsx_vand_v(lowperm, lsx_splat_u16(0xFF00)); // 1 byte: 00000000 0ccccccc // 2 byte: 0010bbbb bbcccccc // 3 byte: 0010bbbb bbcccccc @@ -59454,10 +60719,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // = +0xDC00|0xE7C0 __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); // Generate 
unadjusted trail surrogate minus lowest 2 bits - // vec(0000FF00) = __lsx_vldi(-1758) // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 - __m128i trail = - __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + __m128i trail = __lsx_vbitsel_v(shift, swap, lsx_splat_u32(0x0000FF00)); // Insert low 2 bits of trail surrogate to magic number for later // 11011100 00000000 11100111 110000cc __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); @@ -59470,10 +60733,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, __lsx_vrepli_h(0x3f /* 0x003f*/)); // Blend pairs - // __lsx_vldi(-1741) => vec(0x0000FFFF) // 000000cc ccdddddd|11110aaa bbbbbb00 - __m128i blend = - __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */); + __m128i blend = __lsx_vbitsel_v(lead, trail, lsx_splat_u32(0x0000FFFF)); // Add magic number to finish the result // 110111CC CCDDDDDD|110110AA BBBBBBCC @@ -59510,13 +60771,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, // first. __m128i middlehigh = __lsx_vslli_w(perm, 2); // 00000000 00000000 00cccccc 00000000 - __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + __m128i middlebyte = __lsx_vand_v(perm, lsx_splat_u32(0x00003F00)); // Start assembling the sequence. Since the 4th byte is in the same position // as it would be in a surrogate and there is no dependency, shift left // instead of right. 
3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - __m128i ab = - __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + __m128i ab = __lsx_vbitsel_v(middlehigh, perm, lsx_splat_u32(0xFF000000)); // Top 16 bits contains the high ten bits of the surrogate pair before // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction @@ -59530,8 +60790,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, // After this is for surrogates // Blend the low and high surrogates // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - __m128i mixed = - __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + __m128i mixed = __lsx_vbitsel_v(abc, composed, lsx_splat_u32(0x0000FFFF)); // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: // 11110aaa bbbbbbcc|000000cc ccdddddd @@ -59696,14 +60955,13 @@ size_t convert_masked_utf8_to_utf32(const char *input, // The top bits will be corrected later in the bsl // 00000000 10bbbbbb 00000000 __m128i middle = - __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits // Combine low and middle with shift right accumulate // 00000000 00xxbbbb bbcccccc __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); // Insert top 4 bits from high byte with bitwise select // 00000000 aaaabbbb bbcccccc - __m128i composed = - __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000)); __lsx_vst(composed, utf32_output, 0); utf32_output += 4; // We wrote 4 32-bit characters. 
return consumed; @@ -59732,10 +60990,10 @@ size_t convert_masked_utf8_to_utf32(const char *input, __m128i merge2 = __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ - __lsx_vldi(-2545)); /*0x00000FFF*/ + lsx_splat_u32(0x00000FFF)); // Clear the garbage // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. @@ -59755,11 +61013,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, // Ascii __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); - __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00)); // 00000000 00000000 0000cccc ccdddddd __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii); - __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000)); __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); // Insert twice // 00000000 000aaabb bbbbxxxx xxxxxxxx @@ -59769,8 +61027,7 @@ size_t convert_masked_utf8_to_utf32(const char *input, __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); ab = __lsx_vsrli_w(ab, 4); // 00000000 000aaabb bbbbcccc ccdddddd - __m128i composed = - __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF)); // Store __lsx_vst(composed, utf32_output, 0); utf32_output += 3; // We wrote 3 32-bit characters. 
@@ -60021,7 +61278,7 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60059,9 +61316,8 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { buf += 16; continue; } - __m256i surrogates_bytemask = - __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), - __lasx_xvldi(-2600 /*0xD800*/)); + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. if (__lasx_xbz_v(surrogates_bytemask)) { @@ -60101,14 +61357,14 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); @@ -60116,8 +61372,8 @@ lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); __m256i s3 = __lasx_xvor_v(s2, v_c0e0); __m256i one_or_two_bytes_bytemask = 
__lasx_xvsle_hu(in, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -60276,7 +61532,7 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] __m256i t0 = __lasx_xvslli_h(in, 2); // t1 = [000a|aaaa|0000|0000] - __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60314,9 +61570,8 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, buf += 16; continue; } - __m256i surrogates_bytemask = - __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), - __lasx_xvldi(-2600 /*0xD800*/)); + __m256i surrogates_bytemask = __lasx_xvseq_h( + __lasx_xvand_v(in, lasx_splat_u16(0xf800)), lasx_splat_u16(0xd800)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
if (__lasx_xbz_v(surrogates_bytemask)) { @@ -60356,14 +61611,14 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(in, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(in, 2); // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); @@ -60371,8 +61626,8 @@ lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); __m256i s3 = __lasx_xvor_v(s2, v_c0e0); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -60524,8 +61779,8 @@ lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len, } } - __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_f800 = lasx_splat_u16(0xf800); + __m256i v_d800 = lasx_splat_u16(0xd800); while (end - buf >= 16) { __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); @@ -60623,8 +61878,8 @@ lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, } } - __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_f800 = lasx_splat_u16(0xf800); + __m256i v_d800 = lasx_splat_u16(0xd800); while (end - buf >= 16) { __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); if (!match_system(big_endian)) { @@ -60795,10 +62050,10 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { buf++; } - __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); - __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); __m256i zero = __lasx_xvldi(0); __m128i zero_128 = __lsx_vldi(0); __m256i forbidden_bytemask = __lasx_xvldi(0x0); @@ -60840,7 +62095,7 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // t0 = [000a|aaaa|bbbb|bb00] const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -60920,14 +62175,14 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char 
*utf8_out) { __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -60936,8 +62191,8 @@ lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(utf16_packed, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. 
expand code units 16-bit => 32-bit @@ -61091,10 +62346,10 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, buf++; } - __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); - __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_c080 = lasx_splat_u16(0xc080); + __m256i v_07ff = lasx_splat_u16(0x07ff); + __m256i v_dfff = lasx_splat_u16(0xdfff); + __m256i v_d800 = lasx_splat_u16(0xd800); __m256i zero = __lasx_xvldi(0); __m128i zero_128 = __lsx_vldi(0); __m256i forbidden_bytemask = __lasx_xvldi(0x0); @@ -61135,7 +62390,7 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // t0 = [000a|aaaa|bbbb|bb00] const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00)); // t2 = [0000|0000|00bb|bbbb] const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); // t3 = [000a|aaaa|00bb|bbbb] @@ -61219,14 +62474,14 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); + __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000)); // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3F00)); // [00bb|bbbb|0000|aaaa] __m256i s2 = __lasx_xvor_v(s0, s1); // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] @@ -61235,8 
+62490,8 @@ lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(utf16_packed, v_07ff); - __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, - __lasx_xvldi(-2752 /*0x4000*/)); + __m256i m0 = + __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000)); __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit @@ -61396,8 +62651,8 @@ lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len, } __m256i forbidden_bytemask = __lasx_xvrepli_h(0); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); while (end - buf >= 16) { __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); @@ -61503,8 +62758,8 @@ lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, } __m256i forbidden_bytemask = __lasx_xvrepli_h(0); - __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ - __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + __m256i v_d800 = lasx_splat_u16(0xd800); + __m256i v_dfff = lasx_splat_u16(0xdfff); while (end - buf >= 16) { __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); @@ -61945,10 +63200,9 @@ static inline void load_block(block64 *b, const char16_t *src) { static inline void base64_decode(char *out, __m256i str) { __m256i t0 = __lasx_xvor_v( __lasx_xvslli_w(str, 26), - __lasx_xvslli_w(__lasx_xvand_v(str, __lasx_xvldi(-1758 /*0x0000FF00*/)), - 12)); - __m256i t1 = __lasx_xvsrli_w( - __lasx_xvand_v(str, __lasx_xvldi(-3521 /*0x003F0000*/)), 2); + __lasx_xvslli_w(__lasx_xvand_v(str, lasx_splat_u32(0x0000ff00)), 12)); + __m256i t1 = + __lasx_xvsrli_w(__lasx_xvand_v(str, lasx_splat_u32(0x003f0000)), 2); __m256i t2 = __lasx_xvor_v(t0, t1); __m256i t3 = 
__lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16)); __m256i pack_shuffle = ____m256i( @@ -64028,7 +65282,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (!match_system(big_endian)) { input = input.swap_bytes(); } - // 0xd800 .. 0xdbff - low surrogate // 0xdc00 .. 0xdfff - high surrogate const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800)); @@ -64071,7 +65324,6 @@ simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in, if (iteration == 0) { count += v_count.sum(); v_count = vector_u16::zero(); - iteration = max_iterations; } } @@ -64249,6 +65501,147 @@ const result validate_utf16_with_errors(const char16_t *input, size_t size) { /* end file src/generic/validate_utf16.h */ #endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING +#if SIMDUTF_FEATURE_UTF32 +/* begin file src/generic/utf32.h */ +#include + +namespace simdutf { +namespace lasx { +namespace { +namespace utf32 { + +template T min(T a, T b) { return a <= b ? a : b; } + +simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input, + size_t length) { + using vector_u32 = simd32; + + const char32_t *start = input; + + // we add up to three ones in a single iteration (see the vectorized loop in + // section #2 below) + const size_t max_increment = 3; + + const size_t N = vector_u32::ELEMENTS; + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + const auto v_0000007f = vector_u32::splat(0x0000007f); + const auto v_000007ff = vector_u32::splat(0x000007ff); + const auto v_0000ffff = vector_u32::splat(0x0000ffff); +#else + const auto v_ffffff80 = vector_u32::splat(0xffffff80); + const auto v_fffff800 = vector_u32::splat(0xfffff800); + const auto v_ffff0000 = vector_u32::splat(0xffff0000); + const auto one = vector_u32::splat(1); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + size_t counter = 0; + + // 1. 
vectorized loop unrolled 4 times + { + // we use vector of uint32 counters, this is why this limit is used + const size_t max_iterations = + std::numeric_limits::max() / (max_increment * 4); + size_t blocks = length / (N * 4); + length -= blocks * (N * 4); + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + simd32 acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in0 = vector_u32(input + 0 * N); + const auto in1 = vector_u32(input + 1 * N); + const auto in2 = vector_u32(input + 2 * N); + const auto in3 = vector_u32(input + 3 * N); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in0 > v_0000007f); + acc -= as_vector_u32(in1 > v_0000007f); + acc -= as_vector_u32(in2 > v_0000007f); + acc -= as_vector_u32(in3 > v_0000007f); + + acc -= as_vector_u32(in0 > v_000007ff); + acc -= as_vector_u32(in1 > v_000007ff); + acc -= as_vector_u32(in2 > v_000007ff); + acc -= as_vector_u32(in3 > v_000007ff); + + acc -= as_vector_u32(in0 > v_0000ffff); + acc -= as_vector_u32(in1 > v_0000ffff); + acc -= as_vector_u32(in2 > v_0000ffff); + acc -= as_vector_u32(in3 > v_0000ffff); +#else + acc += min(one, in0 & v_ffffff80); + acc += min(one, in1 & v_ffffff80); + acc += min(one, in2 & v_ffffff80); + acc += min(one, in3 & v_ffffff80); + + acc += min(one, in0 & v_fffff800); + acc += min(one, in1 & v_fffff800); + acc += min(one, in2 & v_fffff800); + acc += min(one, in3 & v_fffff800); + + acc += min(one, in0 & v_ffff0000); + acc += min(one, in1 & v_ffff0000); + acc += min(one, in2 & v_ffff0000); + acc += min(one, in3 & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += 4 * N; + } + + counter += acc.sum(); + } + } + + // 2. 
vectorized loop for tail + { + const size_t max_iterations = + std::numeric_limits::max() / max_increment; + size_t blocks = length / N; + length -= blocks * N; + while (blocks != 0) { + const size_t iterations = min(blocks, max_iterations); + blocks -= iterations; + + auto acc = vector_u32::zero(); + for (size_t i = 0; i < iterations; i++) { + const auto in = vector_u32(input); + +#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP + acc -= as_vector_u32(in > v_0000007f); + acc -= as_vector_u32(in > v_000007ff); + acc -= as_vector_u32(in > v_0000ffff); +#else + acc += min(one, in & v_ffffff80); + acc += min(one, in & v_fffff800); + acc += min(one, in & v_ffff0000); +#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP + + input += N; + } + + counter += acc.sum(); + } + } + + const size_t consumed = input - start; + if (consumed != 0) { + // We don't count 0th bytes in the vectorized loops above, this + // is why we need to count them in the end. + counter += consumed; + } + + return counter + scalar::utf32::utf8_length_from_utf32(input, length); +} + +} // namespace utf32 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf32.h */ +#endif // SIMDUTF_FEATURE_UTF32 + // // Implementation-specific overrides // @@ -65308,45 +66701,14 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8( #if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - __m256i v_80 = __lasx_xvrepli_w(0x80); /*0x00000080*/ - __m256i v_800 = __lasx_xvldi(-3832); /*0x00000800*/ - __m256i v_10000 = __lasx_xvldi(-3583); /*0x00010000*/ - size_t pos = 0; - size_t count = 0; - for (; pos + 8 <= length; pos += 8) { - __m256i in = - __lasx_xvld(reinterpret_cast(input + pos), 0); - __m256i ascii_bytes_bytemask = __lasx_xvslt_w(in, v_80); - __m256i one_two_bytes_bytemask = __lasx_xvslt_w(in, v_800); - __m256i two_bytes_bytemask = - 
__lasx_xvxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); - __m256i three_bytes_bytemask = - __lasx_xvxor_v(__lasx_xvslt_w(in, v_10000), one_two_bytes_bytemask); - - __m256i ascii_bytes = - __lasx_xvpcnt_w(__lasx_xvmskltz_w(ascii_bytes_bytemask)); - const uint32_t ascii_bytes_count = __lasx_xvpickve2gr_wu(ascii_bytes, 0) + - __lasx_xvpickve2gr_wu(ascii_bytes, 4); - __m256i two_bytes = __lasx_xvpcnt_w(__lasx_xvmskltz_w(two_bytes_bytemask)); - const uint32_t two_bytes_count = __lasx_xvpickve2gr_wu(two_bytes, 0) + - __lasx_xvpickve2gr_wu(two_bytes, 4); - __m256i three_bytes = - __lasx_xvpcnt_w(__lasx_xvmskltz_w(three_bytes_bytemask)); - const uint32_t three_bytes_count = __lasx_xvpickve2gr_wu(three_bytes, 0) + - __lasx_xvpickve2gr_wu(three_bytes, 4); - - count += - 32 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count; - } - return count + - scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); + return utf32::utf8_length_from_utf32(input, length); } #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32 #if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32 simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t *input, size_t length) const noexcept { - __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ + __m128i v_ffff = lsx_splat_u32(0x0000ffff); size_t pos = 0; size_t count = 0; for (; pos + 4 <= length; pos += 4) { @@ -65471,6 +66833,7 @@ size_t implementation::binary_to_base64(const char *input, size_t length, } // namespace simdutf /* begin file src/simdutf/lasx/end.h */ +#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP /* end file src/simdutf/lasx/end.h */ /* end file src/lasx/implementation.cpp */ #endif diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index cd92c8ea04955d..02ee47d388e129 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2025-03-17 16:12:36 -0400. Do not edit! */ +/* auto-generated on 2025-04-03 18:47:20 -0400. Do not edit! 
*/ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -652,7 +652,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "6.4.0" +#define SIMDUTF_VERSION "6.4.2" namespace simdutf { enum { @@ -667,7 +667,7 @@ enum { /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 0 + SIMDUTF_VERSION_REVISION = 2 }; } // namespace simdutf @@ -3966,7 +3966,7 @@ atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input, * characters) must be divisible by four. * * You should call this function with a buffer that is at least - * maximal_binary_length_from_utf6_base64(input, length) bytes long. If you fail + * maximal_binary_length_from_base64(input, length) bytes long. If you fail * to provide that much space, the function may cause a buffer overflow. * * Advanced users may want to taylor how the last chunk is handled. By default, @@ -5715,7 +5715,7 @@ class implementation { * character that is not a valid base64 character (INVALID_BASE64_CHARACTER). * * You should call this function with a buffer that is at least - * maximal_binary_length_from_utf6_base64(input, length) bytes long. If you + * maximal_binary_length_from_base64(input, length) bytes long. If you * fail to provide that much space, the function may cause a buffer overflow. * * @param input the base64 string to process, in ASCII stored as