From 3492364eeac70aee126243e55803cede3b4a2e55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Thu, 16 Nov 2023 16:36:09 +0100 Subject: [PATCH 1/8] Add support for masked loads & stores --- crates/core_simd/src/intrinsics.rs | 9 +++++ crates/core_simd/src/vector.rs | 59 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index b27893bc729..1cbbc6708d5 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -107,6 +107,15 @@ extern "platform-intrinsic" { /// like gather, but more spicy, as it writes instead of reads pub(crate) fn simd_scatter(val: T, ptr: U, mask: V); + /// like a loop of reads offset from the same pointer + /// val: vector of values to select if a lane is masked + /// ptr: vector of pointers to read from + /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val) + /// note, the LLVM intrinsic accepts a mask vector of `` + pub(crate) fn simd_masked_load(val: T, ptr: U, mask: V) -> T; + /// like masked_load, but more spicy, as it writes instead of reads + pub(crate) fn simd_masked_store(val: T, ptr: U, mask: V); + // {s,u}add.sat pub(crate) fn simd_saturating_add(x: T, y: T) -> T; diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 18a0bb0a77e..cd67e63fe3a 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -311,6 +311,42 @@ where unsafe { self.store(slice.as_mut_ptr().cast()) } } + #[must_use] + #[inline] + pub fn masked_load_or(slice: &[T], or: Self) -> Self { + Self::masked_load_select(slice, Mask::splat(true), or) + } + + #[must_use] + #[inline] + pub fn masked_load_select(slice: &[T], enable: Mask, or: Self) -> Self { + let ptr = slice.as_ptr(); + let idxs = Simd::::from_slice(&[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]); + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); + unsafe { Self::masked_load_select_ptr(ptr, enable, or) } + } + + #[must_use] + #[inline] + pub unsafe fn masked_load_select_unchecked( + slice: &[T], + enable: Mask, + or: Self, + ) -> Self { + let ptr = slice.as_ptr(); + unsafe { Self::masked_load_select_ptr(ptr, enable, or) } + } + + #[must_use] + #[inline] + pub unsafe fn masked_load_select_ptr(ptr: *const T, enable: Mask, or: Self) -> Self { + unsafe { intrinsics::simd_masked_load(or, ptr, enable.to_int()) } + } + /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. /// If an index is out-of-bounds, the element is instead selected from the `or` vector. /// @@ -489,6 +525,29 @@ where unsafe { intrinsics::simd_gather(or, source, enable.to_int()) } } + #[inline] + pub fn masked_store(self, slice: &mut [T], enable: Mask) { + let ptr = slice.as_mut_ptr(); + let idxs = Simd::::from_slice(&[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]); + let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); + unsafe { self.masked_store_ptr(ptr, enable) } + } + + #[inline] + pub unsafe fn masked_store_unchecked(self, slice: &mut [T], enable: Mask) { + let ptr = slice.as_mut_ptr(); + unsafe { self.masked_store_ptr(ptr, enable) } + } + + #[inline] + pub unsafe fn masked_store_ptr(self, ptr: *mut T, enable: Mask) { + unsafe { intrinsics::simd_masked_store(self, ptr, enable.to_int()) } + } + /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`. /// If an index is out-of-bounds, the write is suppressed without panicking. /// If two elements in the scattered vector would write to the same index From f182fa7eab0aa6e724052ae0608de0ee6e308b37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Thu, 16 Nov 2023 16:40:29 +0100 Subject: [PATCH 2/8] Add tests for masked loads&stores --- crates/core_simd/tests/masked_load_store.rs | 35 +++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 crates/core_simd/tests/masked_load_store.rs diff --git a/crates/core_simd/tests/masked_load_store.rs b/crates/core_simd/tests/masked_load_store.rs new file mode 100644 index 00000000000..374b5c3b728 --- /dev/null +++ b/crates/core_simd/tests/masked_load_store.rs @@ -0,0 +1,35 @@ +#![feature(portable_simd)] +use core_simd::simd::prelude::*; + +#[cfg(target_arch = "wasm32")] +use wasm_bindgen_test::*; + +#[cfg(target_arch = "wasm32")] +wasm_bindgen_test_configure!(run_in_browser); + +#[test] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn masked_load_store() { + let mut arr = [u8::MAX; 7]; + + u8x4::splat(0).masked_store(&mut arr[5..], Mask::from_array([false, true, false, true])); + // write to index 8 is OOB and dropped + assert_eq!(arr, [255u8, 255, 255, 255, 255, 255, 0]); + + u8x4::from_array([0, 1, 2, 3]).masked_store(&mut arr[1..], Mask::splat(true)); + assert_eq!(arr, [255u8, 0, 1, 2, 3, 255, 0]); + + // read from index 8 is OOB and dropped + assert_eq!( + u8x4::masked_load_or(&arr[4..], u8x4::splat(42)), + u8x4::from_array([3, 255, 0, 42]) + ); + assert_eq!( + u8x4::masked_load_select( + &arr[4..], + Mask::from_array([true, false, true, true]), + u8x4::splat(42) + ), + u8x4::from_array([3, 42, 0, 42]) + ); +} From 66a57485c6f5dfdfd804d2c058cdbd84082926d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Fri, 17 Nov 2023 19:13:51 +0100 Subject: [PATCH 3/8] Optimize bounds checking --- crates/core_simd/src/vector.rs | 81 ++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index cd67e63fe3a..bcd4ddcf69d 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -1,3 +1,4 @@ +use super::masks::{ToBitMask, ToBitMaskArray}; use crate::simd::{ cmp::SimdPartialOrd, intrinsics, @@ -313,28 +314,39 @@ where #[must_use] #[inline] - pub fn masked_load_or(slice: &[T], or: Self) -> Self { + pub fn masked_load_or(slice: &[T], or: Self) -> Self + where + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + { Self::masked_load_select(slice, Mask::splat(true), or) } #[must_use] #[inline] - pub fn masked_load_select(slice: &[T], enable: Mask, or: Self) -> Self { - let ptr = slice.as_ptr(); - let idxs = Simd::::from_slice(&[ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]); - let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); - unsafe { Self::masked_load_select_ptr(ptr, enable, or) } + pub fn masked_load_select( + slice: &[T], + mut enable: Mask<::Mask, N>, + or: Self, + ) -> Self + where + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + { + enable &= { + let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32); + let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) }; + let mut in_bounds_arr = Mask::splat(true).to_bitmask_array(); + let len = in_bounds_arr.as_ref().len(); + in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]); + Mask::from_bitmask_array(in_bounds_arr) + }; + unsafe { Self::masked_load_select_ptr(slice.as_ptr(), enable, or) } } #[must_use] #[inline] pub unsafe fn masked_load_select_unchecked( slice: &[T], - enable: Mask, + enable: Mask<::Mask, N>, or: Self, ) -> Self { let ptr = slice.as_ptr(); @@ -343,7 +355,11 @@ where #[must_use] #[inline] - pub unsafe fn masked_load_select_ptr(ptr: *const T, enable: Mask, or: Self) -> Self { + pub unsafe fn masked_load_select_ptr( + ptr: *const T, + enable: Mask<::Mask, N>, + or: Self, + ) -> Self { unsafe { intrinsics::simd_masked_load(or, ptr, enable.to_int()) } } @@ -526,25 +542,33 @@ where } #[inline] - pub fn masked_store(self, slice: &mut [T], enable: Mask) { - let ptr = slice.as_mut_ptr(); - let idxs = Simd::::from_slice(&[ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, - ]); - let enable: Mask = enable & idxs.simd_lt(Simd::splat(slice.len())); - unsafe { self.masked_store_ptr(ptr, enable) } + pub fn masked_store(self, slice: &mut [T], mut enable: Mask<::Mask, N>) + where + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + { + enable &= { + let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32); + let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) }; + let mut in_bounds_arr = Mask::splat(true).to_bitmask_array(); + let len = in_bounds_arr.as_ref().len(); + in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]); + Mask::from_bitmask_array(in_bounds_arr) + }; + unsafe { self.masked_store_ptr(slice.as_mut_ptr(), enable) } } #[inline] - pub unsafe fn masked_store_unchecked(self, slice: &mut [T], enable: Mask) { + pub unsafe fn masked_store_unchecked( + self, + slice: &mut [T], + enable: Mask<::Mask, N>, + ) { let ptr = slice.as_mut_ptr(); unsafe { self.masked_store_ptr(ptr, enable) } } #[inline] - pub unsafe fn masked_store_ptr(self, ptr: *mut T, enable: Mask) { + pub unsafe fn masked_store_ptr(self, ptr: *mut T, enable: Mask<::Mask, N>) { unsafe { intrinsics::simd_masked_store(self, ptr, enable.to_int()) } } @@ -1033,3 +1057,14 @@ where { type Mask = isize; } + +// This function matches the semantics of the `bzhi` instruction on x86 BMI2 +// TODO: optimize it further if possible +// https://stackoverflow.com/questions/75179720/how-to-get-rust-compiler-to-emit-bzhi-instruction-without-resorting-to-platform +fn bzhi_u64(a: u64, ix: u32) -> u64 { + if ix > 63 { + a + } else { + a & (1u64 << ix) - 1 + } +} From 34e54b424e733af6faf2585bc2fad7e47b69fb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Sun, 19 Nov 2023 14:37:49 +0100 Subject: [PATCH 4/8] Rework the masking logic, rename the functions --- crates/core_simd/src/lib.rs | 1 + crates/core_simd/src/vector.rs | 122 +++++++++++++++++--- crates/core_simd/tests/masked_load_store.rs | 4 +- 3 files changed, 112 insertions(+), 15 deletions(-) diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs index 64ba9705ef5..e974e7aa25a 100644 --- a/crates/core_simd/src/lib.rs +++ b/crates/core_simd/src/lib.rs @@ -4,6 +4,7 @@ const_maybe_uninit_as_mut_ptr, const_mut_refs, convert_float_to_int, + core_intrinsics, decl_macro, inline_const, intra_doc_pointers, diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index bcd4ddcf69d..e48b8931db6 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -2,6 +2,7 @@ use super::masks::{ToBitMask, ToBitMaskArray}; use crate::simd::{ cmp::SimdPartialOrd, intrinsics, + prelude::SimdPartialEq, ptr::{SimdConstPtr, SimdMutPtr}, LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle, }; @@ -314,48 +315,95 @@ where #[must_use] #[inline] - pub fn masked_load_or(slice: &[T], or: Self) -> Self + pub fn load_or_default(slice: &[T]) -> Self where Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + T: Default, + ::Mask: Default + + core::convert::From + + core::ops::Add<::Mask, Output = ::Mask>, + Simd<::Mask, N>: SimdPartialOrd, + Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> + + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, { - Self::masked_load_select(slice, Mask::splat(true), or) + Self::load_or(slice, Default::default()) } #[must_use] #[inline] - pub fn masked_load_select( - slice: &[T], - mut enable: Mask<::Mask, N>, - or: Self, - ) -> Self + pub fn load_or(slice: &[T], or: Self) -> Self where Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + ::Mask: Default + + core::convert::From + + core::ops::Add<::Mask, Output = ::Mask>, + Simd<::Mask, N>: SimdPartialOrd, + Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> + + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, { - enable &= { + Self::load_select(slice, Mask::splat(true), or) + } + + #[must_use] + #[inline] + pub fn load_select_or_default(slice: &[T], enable: Mask<::Mask, N>) -> Self + where + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + T: Default, + ::Mask: Default + + core::convert::From + + core::ops::Add<::Mask, Output = ::Mask>, + Simd<::Mask, N>: SimdPartialOrd, + Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> + + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + { + Self::load_select(slice, enable, Default::default()) + } + + #[must_use] + #[inline] + pub fn load_select(slice: &[T], mut enable: Mask<::Mask, N>, or: Self) -> Self + where + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + ::Mask: Default + + core::convert::From + + core::ops::Add<::Mask, Output = ::Mask>, + Simd<::Mask, N>: SimdPartialOrd, + Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> + + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + { + if USE_BRANCH { + if core::intrinsics::likely(enable.all() && slice.len() > N) { + return Self::from_slice(slice); + } + } + enable &= if USE_BITMASK { let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32); let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) }; let mut in_bounds_arr = Mask::splat(true).to_bitmask_array(); let len = in_bounds_arr.as_ref().len(); in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]); Mask::from_bitmask_array(in_bounds_arr) + } else { + mask_up_to(enable, slice.len()) }; - unsafe { Self::masked_load_select_ptr(slice.as_ptr(), enable, or) } + unsafe { Self::load_select_ptr(slice.as_ptr(), enable, or) } } #[must_use] #[inline] - pub unsafe fn masked_load_select_unchecked( + pub unsafe fn load_select_unchecked( slice: &[T], enable: Mask<::Mask, N>, or: Self, ) -> Self { let ptr = slice.as_ptr(); - unsafe { Self::masked_load_select_ptr(ptr, enable, or) } + unsafe { Self::load_select_ptr(ptr, enable, or) } } #[must_use] #[inline] - pub unsafe fn masked_load_select_ptr( + pub unsafe fn load_select_ptr( ptr: *const T, enable: Mask<::Mask, N>, or: Self, @@ -545,14 +593,28 @@ where pub fn masked_store(self, slice: &mut [T], mut enable: Mask<::Mask, N>) where Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + Mask<::Mask, N>: ToBitMask + ToBitMaskArray, + ::Mask: Default + + core::convert::From + + core::ops::Add<::Mask, Output = ::Mask>, + Simd<::Mask, N>: SimdPartialOrd, + Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> + + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, { - enable &= { + if USE_BRANCH { + if core::intrinsics::likely(enable.all() && slice.len() > N) { + return self.copy_to_slice(slice); + } + } + enable &= if USE_BITMASK { let mask = bzhi_u64(u64::MAX, core::cmp::min(N, slice.len()) as u32); let mask_bytes: [u8; 8] = unsafe { core::mem::transmute(mask) }; let mut in_bounds_arr = Mask::splat(true).to_bitmask_array(); let len = in_bounds_arr.as_ref().len(); in_bounds_arr.as_mut().copy_from_slice(&mask_bytes[..len]); Mask::from_bitmask_array(in_bounds_arr) + } else { + mask_up_to(enable, slice.len()) }; unsafe { self.masked_store_ptr(slice.as_mut_ptr(), enable) } } @@ -1058,9 +1120,43 @@ where type Mask = isize; } +const USE_BRANCH: bool = false; +const USE_BITMASK: bool = false; + +#[inline] +fn index() -> Simd +where + T: MaskElement + Default + core::convert::From + core::ops::Add, + LaneCount: SupportedLaneCount, +{ + let mut index = [T::default(); N]; + for i in 1..N { + index[i] = index[i - 1] + T::from(1); + } + Simd::from_array(index) +} + +#[inline] +fn mask_up_to(enable: Mask, len: usize) -> Mask +where + LaneCount: SupportedLaneCount, + M: MaskElement + Default + core::convert::From + core::ops::Add, + Simd: SimdPartialOrd, + // as SimdPartialEq>::Mask: Mask, + Mask: core::ops::BitAnd> + + core::convert::From< as SimdPartialEq>::Mask>, +{ + let index = index::(); + enable + & Mask::::from( + index.simd_lt(Simd::splat(M::from(i8::try_from(len).unwrap_or(i8::MAX)))), + ) +} + // This function matches the semantics of the `bzhi` instruction on x86 BMI2 // TODO: optimize it further if possible // https://stackoverflow.com/questions/75179720/how-to-get-rust-compiler-to-emit-bzhi-instruction-without-resorting-to-platform +#[inline(always)] fn bzhi_u64(a: u64, ix: u32) -> u64 { if ix > 63 { a diff --git a/crates/core_simd/tests/masked_load_store.rs b/crates/core_simd/tests/masked_load_store.rs index 374b5c3b728..e830330249c 100644 --- a/crates/core_simd/tests/masked_load_store.rs +++ b/crates/core_simd/tests/masked_load_store.rs @@ -21,11 +21,11 @@ fn masked_load_store() { // read from index 8 is OOB and dropped assert_eq!( - u8x4::masked_load_or(&arr[4..], u8x4::splat(42)), + u8x4::load_or(&arr[4..], u8x4::splat(42)), u8x4::from_array([3, 255, 0, 42]) ); assert_eq!( - u8x4::masked_load_select( + u8x4::load_select( &arr[4..], Mask::from_array([true, false, true, true]), u8x4::splat(42) From f0b4f2648d9d40b338857976fb86153d11dd8e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Sun, 19 Nov 2023 15:37:29 +0100 Subject: [PATCH 5/8] Optimize masking by always calculating it in i8 space --- crates/core_simd/src/vector.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index e48b8931db6..715aac9a194 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -324,7 +324,7 @@ where + core::ops::Add<::Mask, Output = ::Mask>, Simd<::Mask, N>: SimdPartialOrd, Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> - + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + + core::convert::From>, { Self::load_or(slice, Default::default()) } @@ -339,7 +339,7 @@ where + core::ops::Add<::Mask, Output = ::Mask>, Simd<::Mask, N>: SimdPartialOrd, Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> - + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + + core::convert::From>, { Self::load_select(slice, Mask::splat(true), or) } @@ -355,7 +355,7 @@ where + core::ops::Add<::Mask, Output = ::Mask>, Simd<::Mask, N>: SimdPartialOrd, Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> - + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + + core::convert::From>, { Self::load_select(slice, enable, Default::default()) } @@ -370,7 +370,7 @@ where + core::ops::Add<::Mask, Output = ::Mask>, Simd<::Mask, N>: SimdPartialOrd, Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> - + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + + core::convert::From>, { if USE_BRANCH { if core::intrinsics::likely(enable.all() && slice.len() > N) { @@ -599,7 +599,7 @@ where + core::ops::Add<::Mask, Output = ::Mask>, Simd<::Mask, N>: SimdPartialOrd, Mask<::Mask, N>: core::ops::BitAnd::Mask, N>> - + core::convert::From<::Mask, N> as SimdPartialEq>::Mask>, + + core::convert::From>, { if USE_BRANCH { if core::intrinsics::likely(enable.all() && slice.len() > N) { @@ -1143,14 +1143,10 @@ where M: MaskElement + Default + core::convert::From + core::ops::Add, Simd: SimdPartialOrd, // as SimdPartialEq>::Mask: Mask, - Mask: core::ops::BitAnd> - + core::convert::From< as SimdPartialEq>::Mask>, + Mask: core::ops::BitAnd> + core::convert::From>, { - let index = index::(); - enable - & Mask::::from( - index.simd_lt(Simd::splat(M::from(i8::try_from(len).unwrap_or(i8::MAX)))), - ) + let index = index::(); + enable & Mask::::from(index.simd_lt(Simd::splat(i8::try_from(len).unwrap_or(i8::MAX)))) } // This function matches the semantics of the `bzhi` instruction on x86 BMI2 From 3ed9d1594a43081384dd38003ea83220cac01db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Tue, 5 Dec 2023 14:53:20 +0100 Subject: [PATCH 6/8] set USE_BRANCH=true --- crates/core_simd/src/vector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 715aac9a194..09ad68f1b01 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -1120,7 +1120,7 @@ where type Mask = isize; } -const USE_BRANCH: bool = false; +const USE_BRANCH: bool = true; const USE_BITMASK: bool = false; #[inline] From 470e711fd37a6ba9de05202f3738dbac81fc87c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Mon, 11 Dec 2023 11:31:59 +0100 Subject: [PATCH 7/8] Update intrinsics to use the version shipped in nightly --- crates/core_simd/src/intrinsics.rs | 4 ++-- crates/core_simd/src/vector.rs | 4 ++-- crates/core_simd/tests/pointers.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs index 1cbbc6708d5..5c688c11991 100644 --- a/crates/core_simd/src/intrinsics.rs +++ b/crates/core_simd/src/intrinsics.rs @@ -112,9 +112,9 @@ extern "platform-intrinsic" { /// ptr: vector of pointers to read from /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val) /// note, the LLVM intrinsic accepts a mask vector of `` - pub(crate) fn simd_masked_load(val: T, ptr: U, mask: V) -> T; + pub(crate) fn simd_masked_load(mask: V, ptr: U, val: T) -> T; /// like masked_load, but more spicy, as it writes instead of reads - pub(crate) fn simd_masked_store(val: T, ptr: U, mask: V); + pub(crate) fn simd_masked_store(mask: V, ptr: U, val: T); // {s,u}add.sat pub(crate) fn simd_saturating_add(x: T, y: T) -> T; diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index 09ad68f1b01..b7500c83ce8 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -408,7 +408,7 @@ where enable: Mask<::Mask, N>, or: Self, ) -> Self { - unsafe { intrinsics::simd_masked_load(or, ptr, enable.to_int()) } + unsafe { intrinsics::simd_masked_load(enable.to_int(), ptr, or) } } /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector. @@ -631,7 +631,7 @@ where #[inline] pub unsafe fn masked_store_ptr(self, ptr: *mut T, enable: Mask<::Mask, N>) { - unsafe { intrinsics::simd_masked_store(self, ptr, enable.to_int()) } + unsafe { intrinsics::simd_masked_store(enable.to_int(), ptr, self) } } /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`. diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs index a90ff928ced..b9f32d16e01 100644 --- a/crates/core_simd/tests/pointers.rs +++ b/crates/core_simd/tests/pointers.rs @@ -1,4 +1,4 @@ -#![feature(portable_simd, strict_provenance)] +#![feature(portable_simd, strict_provenance, exposed_provenance)] use core_simd::simd::{ ptr::{SimdConstPtr, SimdMutPtr}, From a4ae456ed250d7df0c5ae9d32539ceb78b6da57e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Oko=C5=84ski?= Date: Mon, 11 Dec 2023 11:37:23 +0100 Subject: [PATCH 8/8] Fix imports --- crates/core_simd/src/vector.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs index b7500c83ce8..46c6bbc88b2 100644 --- a/crates/core_simd/src/vector.rs +++ b/crates/core_simd/src/vector.rs @@ -1,8 +1,7 @@ -use super::masks::{ToBitMask, ToBitMaskArray}; use crate::simd::{ cmp::SimdPartialOrd, intrinsics, - prelude::SimdPartialEq, + ToBitMask, ToBitMaskArray, ptr::{SimdConstPtr, SimdMutPtr}, LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle, };