From b3eb737eb167779fce64d11e8ee760221e5cae1a Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Sun, 18 Mar 2018 11:07:44 +0100 Subject: [PATCH] add arm neon vector types --- ci/run.sh | 10 + coresimd/aarch64/mod.rs | 2 - coresimd/aarch64/neon.rs | 329 +++++++------ coresimd/aarch64/v8.rs | 9 +- coresimd/arm/mod.rs | 11 +- coresimd/arm/neon.rs | 737 ++++++++++++++++++++++-------- coresimd/arm/v7.rs | 27 +- coresimd/macros.rs | 23 + coresimd/mod.rs | 3 + coresimd/ppsv/mod.rs | 5 +- coresimd/ppsv/v128.rs | 121 +++-- coresimd/ppsv/v256.rs | 57 +-- coresimd/ppsv/v64.rs | 108 ++++- coresimd/x86/mod.rs | 31 +- crates/coresimd/tests/v128.rs | 5 +- crates/coresimd/tests/v16.rs | 5 +- crates/coresimd/tests/v256.rs | 5 +- crates/coresimd/tests/v32.rs | 5 +- crates/coresimd/tests/v512.rs | 5 +- crates/coresimd/tests/v64.rs | 5 +- crates/simd-test-macro/src/lib.rs | 2 +- examples/nbody.rs | 10 +- stdsimd/arch/detect/arch/arm.rs | 4 + 23 files changed, 1046 insertions(+), 473 deletions(-) create mode 100644 coresimd/macros.rs diff --git a/ci/run.sh b/ci/run.sh index 1ca1180d18..a27eaa463b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -12,6 +12,16 @@ export RUST_TEST_THREADS=1 FEATURES="strict,$FEATURES" +# FIXME: on armv7 neon intrinsics require the neon target-feature to be +# unconditionally enabled. +case ${TARGET} in + armv7*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+neon" + ;; + *) + ;; +esac + echo "RUSTFLAGS=${RUSTFLAGS}" echo "FEATURES=${FEATURES}" echo "OBJDUMP=${OBJDUMP}" diff --git a/coresimd/aarch64/mod.rs b/coresimd/aarch64/mod.rs index f8e0fbd40c..5c794e3750 100644 --- a/coresimd/aarch64/mod.rs +++ b/coresimd/aarch64/mod.rs @@ -11,7 +11,5 @@ mod v8; pub use self::v8::*; -#[cfg(target_feature = "neon")] mod neon; -#[cfg(target_feature = "neon")] pub use self::neon::*; diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs index d4f3032e51..2b272c7427 100644 --- a/coresimd/aarch64/neon.rs +++ b/coresimd/aarch64/neon.rs @@ -6,20 +6,51 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; use coresimd::simd::*; +use coresimd::arm::*; + +types! { + /// ARM-specific 64-bit wide vector of one packed `f64`. + pub struct float64x1_t(f64); // FIXME: check this! + /// ARM-specific 128-bit wide vector of two packed `f64`. + pub struct float64x2_t(f64, f64); +} +impl_from_bits_!( + float64x1_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + float64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); /// Vector add. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { - a + b +pub unsafe fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { +pub unsafe fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { simd_add(a, b) } @@ -27,85 +58,85 @@ pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { - a + b +pub unsafe fn vaddd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_add(a, b) } /// Vector add. 
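// Usage sketch (illustration only, mirroring `test_vaddq_f64` below): the new
// NEON types are opaque, so callers reinterpret the portable vectors through
// `IntoBits`/`FromBits` at the call boundary:
//
//     let a = f64x2::new(1., 2.);
//     let b = f64x2::new(8., 7.);
//     let r: f64x2 = vaddq_f64(a.into_bits(), b.into_bits()).into_bits();
//     assert_eq!(r, f64x2::new(9., 9.));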
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 { - a + b +pub unsafe fn vaddd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_add(a, b) } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"] - fn vmaxv_s8_(a: i8x8) -> i8; + fn vmaxv_s8_(a: int8x8_t) -> i8; #[link_name = "llvm.aarch64.neon.smaxv.i8.v16i8"] - fn vmaxvq_s8_(a: i8x16) -> i8; + fn vmaxvq_s8_(a: int8x16_t) -> i8; #[link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"] - fn vmaxv_s16_(a: i16x4) -> i16; + fn vmaxv_s16_(a: int16x4_t) -> i16; #[link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"] - fn vmaxvq_s16_(a: i16x8) -> i16; + fn vmaxvq_s16_(a: int16x8_t) -> i16; #[link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"] - fn vmaxv_s32_(a: i32x2) -> i32; + fn vmaxv_s32_(a: int32x2_t) -> i32; #[link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"] - fn vmaxvq_s32_(a: i32x4) -> i32; + fn vmaxvq_s32_(a: int32x4_t) -> i32; #[link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"] - fn vmaxv_u8_(a: u8x8) -> u8; + fn vmaxv_u8_(a: uint8x8_t) -> u8; #[link_name = "llvm.aarch64.neon.umaxv.i8.v16i8"] - fn vmaxvq_u8_(a: u8x16) -> u8; + fn vmaxvq_u8_(a: uint8x16_t) -> u8; #[link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"] - fn vmaxv_u16_(a: u16x4) -> u16; + fn vmaxv_u16_(a: uint16x4_t) -> u16; #[link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"] - fn vmaxvq_u16_(a: u16x8) -> u16; + fn vmaxvq_u16_(a: uint16x8_t) -> u16; #[link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"] - fn vmaxv_u32_(a: u32x2) -> u32; + fn vmaxv_u32_(a: uint32x2_t) -> u32; #[link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"] - fn vmaxvq_u32_(a: u32x4) -> u32; + fn vmaxvq_u32_(a: uint32x4_t) -> u32; #[link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32"] - fn vmaxv_f32_(a: f32x2) -> f32; + fn vmaxv_f32_(a: float32x2_t) -> f32; #[link_name = "llvm.aarch64.neon.fmaxv.f32.v4f32"] - fn vmaxvq_f32_(a: f32x4) -> f32; + fn vmaxvq_f32_(a: float32x4_t) -> f32; #[link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64"] - fn vmaxvq_f64_(a: f64x2) -> f64; + fn vmaxvq_f64_(a: float64x2_t) -> f64; #[link_name = "llvm.aarch64.neon.sminv.i8.v8i8"] - fn vminv_s8_(a: i8x8) -> i8; + fn vminv_s8_(a: int8x8_t) -> i8; #[link_name = "llvm.aarch64.neon.sminv.i8.v16i8"] - fn vminvq_s8_(a: i8x16) -> i8; + fn vminvq_s8_(a: int8x16_t) -> i8; #[link_name = "llvm.aarch64.neon.sminv.i16.v4i16"] - fn vminv_s16_(a: i16x4) -> i16; + fn vminv_s16_(a: int16x4_t) -> i16; #[link_name = "llvm.aarch64.neon.sminv.i16.v8i16"] - fn vminvq_s16_(a: i16x8) -> i16; + fn vminvq_s16_(a: int16x8_t) -> i16; #[link_name = "llvm.aarch64.neon.sminv.i32.v2i32"] - fn vminv_s32_(a: i32x2) -> i32; + fn vminv_s32_(a: int32x2_t) -> i32; #[link_name = "llvm.aarch64.neon.sminv.i32.v4i32"] - fn vminvq_s32_(a: i32x4) -> i32; + fn vminvq_s32_(a: int32x4_t) -> i32; #[link_name = "llvm.aarch64.neon.uminv.i8.v8i8"] - fn vminv_u8_(a: u8x8) -> u8; + fn vminv_u8_(a: uint8x8_t) -> u8; #[link_name = "llvm.aarch64.neon.uminv.i8.v16i8"] - fn vminvq_u8_(a: u8x16) -> u8; + fn vminvq_u8_(a: uint8x16_t) -> u8; #[link_name = "llvm.aarch64.neon.uminv.i16.v4i16"] - fn vminv_u16_(a: u16x4) -> u16; + fn vminv_u16_(a: uint16x4_t) -> u16; #[link_name = "llvm.aarch64.neon.uminv.i16.v8i16"] - fn vminvq_u16_(a: u16x8) -> u16; + fn vminvq_u16_(a: uint16x8_t) -> u16; #[link_name = "llvm.aarch64.neon.uminv.i32.v2i32"] - fn vminv_u32_(a: u32x2) -> u32; + fn vminv_u32_(a: uint32x2_t) -> u32; #[link_name = "llvm.aarch64.neon.uminv.i32.v4i32"] - fn vminvq_u32_(a: u32x4) -> u32; + fn
vminvq_u32_(a: uint32x4_t) -> u32; #[link_name = "llvm.aarch64.neon.fminv.f32.v2f32"] - fn vminv_f32_(a: f32x2) -> f32; + fn vminv_f32_(a: float32x2_t) -> f32; #[link_name = "llvm.aarch64.neon.fminv.f32.v4f32"] - fn vminvq_f32_(a: f32x4) -> f32; + fn vminvq_f32_(a: float32x4_t) -> f32; #[link_name = "llvm.aarch64.neon.fminv.f64.v2f64"] - fn vminvq_f64_(a: f64x2) -> f64; + fn vminvq_f64_(a: float64x2_t) -> f64; } @@ -113,7 +144,7 @@ extern "C" { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxv_s8(a: i8x8) -> i8 { +pub unsafe fn vmaxv_s8(a: int8x8_t) -> i8 { vmaxv_s8_(a) } @@ -121,7 +152,7 @@ pub unsafe fn vmaxv_s8(a: i8x8) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s8(a: i8x16) -> i8 { +pub unsafe fn vmaxvq_s8(a: int8x16_t) -> i8 { vmaxvq_s8_(a) } @@ -129,7 +160,7 @@ pub unsafe fn vmaxvq_s8(a: i8x16) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxv_s16(a: i16x4) -> i16 { +pub unsafe fn vmaxv_s16(a: int16x4_t) -> i16 { vmaxv_s16_(a) } @@ -137,7 +168,7 @@ pub unsafe fn vmaxv_s16(a: i16x4) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s16(a: i16x8) -> i16 { +pub unsafe fn vmaxvq_s16(a: int16x8_t) -> i16 { vmaxvq_s16_(a) } @@ -145,7 +176,7 @@ pub unsafe fn vmaxvq_s16(a: i16x8) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxp))] -pub unsafe fn vmaxv_s32(a: i32x2) -> i32 { +pub unsafe fn vmaxv_s32(a: int32x2_t) -> i32 { vmaxv_s32_(a) } @@ -153,7 +184,7 @@ pub unsafe fn vmaxv_s32(a: i32x2) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s32(a: i32x4) -> i32 { +pub unsafe fn vmaxvq_s32(a: int32x4_t) -> i32 { vmaxvq_s32_(a) } @@ -161,7 +192,7 @@ pub unsafe fn vmaxvq_s32(a: i32x4) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxv_u8(a: u8x8) -> u8 { +pub unsafe fn vmaxv_u8(a: uint8x8_t) -> u8 { vmaxv_u8_(a) } @@ -169,7 +200,7 @@ pub unsafe fn vmaxv_u8(a: u8x8) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u8(a: u8x16) -> u8 { +pub unsafe fn vmaxvq_u8(a: uint8x16_t) -> u8 { vmaxvq_u8_(a) } @@ -177,7 +208,7 @@ pub unsafe fn vmaxvq_u8(a: u8x16) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxv_u16(a: u16x4) -> u16 { +pub unsafe fn vmaxv_u16(a: uint16x4_t) -> u16 { vmaxv_u16_(a) } @@ -185,7 +216,7 @@ pub unsafe fn vmaxv_u16(a: u16x4) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u16(a: u16x8) -> u16 { +pub unsafe fn vmaxvq_u16(a: uint16x8_t) -> u16 { vmaxvq_u16_(a) } @@ -193,7 +224,7 @@ pub unsafe fn vmaxvq_u16(a: u16x8) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxp))] -pub unsafe fn vmaxv_u32(a: u32x2) -> u32 { +pub unsafe fn vmaxv_u32(a: uint32x2_t) -> u32 { vmaxv_u32_(a) } @@ -201,7 +232,7 @@ pub unsafe fn vmaxv_u32(a: u32x2) -> u32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u32(a: u32x4) -> u32 { +pub unsafe fn vmaxvq_u32(a: uint32x4_t) -> u32 { vmaxvq_u32_(a) } @@ -209,7 +240,7 @@ pub unsafe fn vmaxvq_u32(a: u32x4) -> u32 { #[inline] #[target_feature(enable = "neon")] 
#[cfg_attr(test, assert_instr(fmaxp))] -pub unsafe fn vmaxv_f32(a: f32x2) -> f32 { +pub unsafe fn vmaxv_f32(a: float32x2_t) -> f32 { vmaxv_f32_(a) } @@ -217,7 +248,7 @@ pub unsafe fn vmaxv_f32(a: f32x2) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fmaxv))] -pub unsafe fn vmaxvq_f32(a: f32x4) -> f32 { +pub unsafe fn vmaxvq_f32(a: float32x4_t) -> f32 { vmaxvq_f32_(a) } @@ -225,7 +256,7 @@ pub unsafe fn vmaxvq_f32(a: f32x4) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fmaxp))] -pub unsafe fn vmaxvq_f64(a: f64x2) -> f64 { +pub unsafe fn vmaxvq_f64(a: float64x2_t) -> f64 { vmaxvq_f64_(a) } @@ -233,7 +264,7 @@ pub unsafe fn vmaxvq_f64(a: f64x2) -> f64 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminv_s8(a: i8x8) -> i8 { +pub unsafe fn vminv_s8(a: int8x8_t) -> i8 { vminv_s8_(a) } @@ -241,7 +272,7 @@ pub unsafe fn vminv_s8(a: i8x8) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s8(a: i8x16) -> i8 { +pub unsafe fn vminvq_s8(a: int8x16_t) -> i8 { vminvq_s8_(a) } @@ -249,7 +280,7 @@ pub unsafe fn vminvq_s8(a: i8x16) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminv_s16(a: i16x4) -> i16 { +pub unsafe fn vminv_s16(a: int16x4_t) -> i16 { vminv_s16_(a) } @@ -257,7 +288,7 @@ pub unsafe fn vminv_s16(a: i16x4) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s16(a: i16x8) -> i16 { +pub unsafe fn vminvq_s16(a: int16x8_t) -> i16 { vminvq_s16_(a) } @@ -265,7 +296,7 @@ pub unsafe fn vminvq_s16(a: i16x8) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminp))] -pub unsafe fn vminv_s32(a: i32x2) -> i32 { +pub unsafe fn vminv_s32(a: int32x2_t) -> i32 { vminv_s32_(a) } @@ -273,7 +304,7 @@ pub unsafe fn vminv_s32(a: i32x2) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s32(a: i32x4) -> i32 { +pub unsafe fn vminvq_s32(a: int32x4_t) -> i32 { vminvq_s32_(a) } @@ -281,7 +312,7 @@ pub unsafe fn vminvq_s32(a: i32x4) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminv_u8(a: u8x8) -> u8 { +pub unsafe fn vminv_u8(a: uint8x8_t) -> u8 { vminv_u8_(a) } @@ -289,7 +320,7 @@ pub unsafe fn vminv_u8(a: u8x8) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u8(a: u8x16) -> u8 { +pub unsafe fn vminvq_u8(a: uint8x16_t) -> u8 { vminvq_u8_(a) } @@ -297,7 +328,7 @@ pub unsafe fn vminvq_u8(a: u8x16) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminv_u16(a: u16x4) -> u16 { +pub unsafe fn vminv_u16(a: uint16x4_t) -> u16 { vminv_u16_(a) } @@ -305,7 +336,7 @@ pub unsafe fn vminv_u16(a: u16x4) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u16(a: u16x8) -> u16 { +pub unsafe fn vminvq_u16(a: uint16x8_t) -> u16 { vminvq_u16_(a) } @@ -313,7 +344,7 @@ pub unsafe fn vminvq_u16(a: u16x8) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminp))] -pub unsafe fn vminv_u32(a: u32x2) -> u32 { +pub unsafe fn vminv_u32(a: uint32x2_t) -> u32 { vminv_u32_(a) } @@ -321,7 +352,7 @@ pub unsafe fn vminv_u32(a: u32x2) -> u32 { #[inline] 
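// Note: the `vmaxv*`/`vminv*` intrinsics in this block are horizontal
// reductions that fold every lane of the input into one scalar. Sketch,
// with values taken from `test_vmaxv_s8` below:
//
//     let r: i8 = vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits());
//     assert_eq!(r, 7); // the largest of the eight lanes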
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u32(a: u32x4) -> u32 { +pub unsafe fn vminvq_u32(a: uint32x4_t) -> u32 { vminvq_u32_(a) } @@ -329,7 +360,7 @@ pub unsafe fn vminvq_u32(a: u32x4) -> u32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminp))] -pub unsafe fn vminv_f32(a: f32x2) -> f32 { +pub unsafe fn vminv_f32(a: float32x2_t) -> f32 { vminv_f32_(a) } @@ -337,7 +368,7 @@ pub unsafe fn vminv_f32(a: f32x2) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminv))] -pub unsafe fn vminvq_f32(a: f32x4) -> f32 { +pub unsafe fn vminvq_f32(a: float32x4_t) -> f32 { vminvq_f32_(a) } @@ -345,253 +376,257 @@ pub unsafe fn vminvq_f32(a: f32x4) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminp))] -pub unsafe fn vminvq_f64(a: f64x2) -> f64 { +pub unsafe fn vminvq_f64(a: float64x2_t) -> f64 { vminvq_f64_(a) } #[cfg(test)] mod tests { - use simd::*; - use coresimd::aarch64::neon; use stdsimd_test::simd_test; + use simd::*; + use coresimd::aarch64::*; + use std::mem; #[simd_test = "neon"] - unsafe fn vadd_f64() { + unsafe fn test_vadd_f64() { let a = 1.; let b = 8.; let e = 9.; - let r = neon::vadd_f64(a, b); + let r: f64 = + mem::transmute(vadd_f64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_f64() { + unsafe fn test_vaddq_f64() { let a = f64x2::new(1., 2.); let b = f64x2::new(8., 7.); let e = f64x2::new(9., 9.); - let r = neon::vaddq_f64(a, b); + let r: f64x2 = vaddq_f64(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddd_s64() { - let a = 1; - let b = 8; - let e = 9; - let r = neon::vaddd_s64(a, b); + unsafe fn test_vaddd_s64() { + let a = 1_i64; + let b = 8_i64; + let e = 9_i64; + let r: i64 = + mem::transmute(vaddd_s64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddd_u64() { - let a = 1; - let b = 8; - let e = 9; - let r = neon::vaddd_u64(a, b); + unsafe fn test_vaddd_u64() { + let a = 1_u64; + let b = 8_u64; + let e = 9_u64; + let r: u64 = + mem::transmute(vaddd_u64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmaxv_s8() { - let r = neon::vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)); + unsafe fn test_vmaxv_s8() { + let r = vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits()); assert_eq!(r, 7_i8); } #[simd_test = "neon"] - unsafe fn vmaxvq_s8() { + unsafe fn test_vmaxvq_s8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vmaxvq_s8(i8x16::new( + let r = vmaxvq_s8(i8x16::new( 1, 2, 3, 4, -16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 8_i8); } #[simd_test = "neon"] - unsafe fn vmaxv_s16() { - let r = neon::vmaxv_s16(i16x4::new(1, 2, -4, 3)); + unsafe fn test_vmaxv_s16() { + let r = vmaxv_s16(i16x4::new(1, 2, -4, 3).into_bits()); assert_eq!(r, 3_i16); } #[simd_test = "neon"] - unsafe fn vmaxvq_s16() { - let r = neon::vmaxvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)); + unsafe fn test_vmaxvq_s16() { + let r = vmaxvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5).into_bits()); assert_eq!(r, 7_i16); } #[simd_test = "neon"] - unsafe fn vmaxv_s32() { - let r = neon::vmaxv_s32(i32x2::new(1, -4)); + unsafe fn test_vmaxv_s32() { + let r = vmaxv_s32(i32x2::new(1, -4).into_bits()); assert_eq!(r, 1_i32); } #[simd_test = "neon"] - unsafe fn vmaxvq_s32() { - let r = neon::vmaxvq_s32(i32x4::new(1, 2, -32, 4)); + unsafe fn 
test_vmaxvq_s32() { + let r = vmaxvq_s32(i32x4::new(1, 2, -32, 4).into_bits()); assert_eq!(r, 4_i32); } #[simd_test = "neon"] - unsafe fn vmaxv_u8() { - let r = neon::vmaxv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)); + unsafe fn test_vmaxv_u8() { + let r = vmaxv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5).into_bits()); assert_eq!(r, 8_u8); } #[simd_test = "neon"] - unsafe fn vmaxvq_u8() { + unsafe fn test_vmaxvq_u8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vmaxvq_u8(u8x16::new( + let r = vmaxvq_u8(u8x16::new( 1, 2, 3, 4, 16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 16_u8); } #[simd_test = "neon"] - unsafe fn vmaxv_u16() { - let r = neon::vmaxv_u16(u16x4::new(1, 2, 4, 3)); + unsafe fn test_vmaxv_u16() { + let r = vmaxv_u16(u16x4::new(1, 2, 4, 3).into_bits()); assert_eq!(r, 4_u16); } #[simd_test = "neon"] - unsafe fn vmaxvq_u16() { - let r = neon::vmaxvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)); + unsafe fn test_vmaxvq_u16() { + let r = vmaxvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5).into_bits()); assert_eq!(r, 16_u16); } #[simd_test = "neon"] - unsafe fn vmaxv_u32() { - let r = neon::vmaxv_u32(u32x2::new(1, 4)); + unsafe fn test_vmaxv_u32() { + let r = vmaxv_u32(u32x2::new(1, 4).into_bits()); assert_eq!(r, 4_u32); } #[simd_test = "neon"] - unsafe fn vmaxvq_u32() { - let r = neon::vmaxvq_u32(u32x4::new(1, 2, 32, 4)); + unsafe fn test_vmaxvq_u32() { + let r = vmaxvq_u32(u32x4::new(1, 2, 32, 4).into_bits()); assert_eq!(r, 32_u32); } #[simd_test = "neon"] - unsafe fn vmaxv_f32() { - let r = neon::vmaxv_f32(f32x2::new(1., 4.)); + unsafe fn test_vmaxv_f32() { + let r = vmaxv_f32(f32x2::new(1., 4.).into_bits()); assert_eq!(r, 4_f32); } #[simd_test = "neon"] - unsafe fn vmaxvq_f32() { - let r = neon::vmaxvq_f32(f32x4::new(1., 2., 32., 4.)); + unsafe fn test_vmaxvq_f32() { + let r = vmaxvq_f32(f32x4::new(1., 2., 32., 4.).into_bits()); assert_eq!(r, 32_f32); } #[simd_test = "neon"] - unsafe fn vmaxvq_f64() { - let r = neon::vmaxvq_f64(f64x2::new(1., 4.)); + unsafe fn test_vmaxvq_f64() { + let r = vmaxvq_f64(f64x2::new(1., 4.).into_bits()); assert_eq!(r, 4_f64); } #[simd_test = "neon"] - unsafe fn vminv_s8() { - let r = neon::vminv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)); + unsafe fn test_vminv_s8() { + let r = vminv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits()); assert_eq!(r, -8_i8); } #[simd_test = "neon"] - unsafe fn vminvq_s8() { + unsafe fn test_vminvq_s8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vminvq_s8(i8x16::new( + let r = vminvq_s8(i8x16::new( 1, 2, 3, 4, -16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, -16_i8); } #[simd_test = "neon"] - unsafe fn vminv_s16() { - let r = neon::vminv_s16(i16x4::new(1, 2, -4, 3)); + unsafe fn test_vminv_s16() { + let r = vminv_s16(i16x4::new(1, 2, -4, 3).into_bits()); assert_eq!(r, -4_i16); } #[simd_test = "neon"] - unsafe fn vminvq_s16() { - let r = neon::vminvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)); + unsafe fn test_vminvq_s16() { + let r = vminvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5).into_bits()); assert_eq!(r, -16_i16); } #[simd_test = "neon"] - unsafe fn vminv_s32() { - let r = neon::vminv_s32(i32x2::new(1, -4)); + unsafe fn test_vminv_s32() { + let r = vminv_s32(i32x2::new(1, -4).into_bits()); assert_eq!(r, -4_i32); } #[simd_test = "neon"] - unsafe fn vminvq_s32() { - let r = neon::vminvq_s32(i32x4::new(1, 2, -32, 4)); + unsafe fn test_vminvq_s32() { + let r = vminvq_s32(i32x4::new(1, 2, -32, 4).into_bits()); assert_eq!(r, -32_i32); } #[simd_test = "neon"] - unsafe fn 
vminv_u8() { - let r = neon::vminv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)); + unsafe fn test_vminv_u8() { + let r = vminv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5).into_bits()); assert_eq!(r, 1_u8); } #[simd_test = "neon"] - unsafe fn vminvq_u8() { + unsafe fn test_vminvq_u8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vminvq_u8(u8x16::new( + let r = vminvq_u8(u8x16::new( 1, 2, 3, 4, 16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 1_u8); } #[simd_test = "neon"] - unsafe fn vminv_u16() { - let r = neon::vminv_u16(u16x4::new(1, 2, 4, 3)); + unsafe fn test_vminv_u16() { + let r = vminv_u16(u16x4::new(1, 2, 4, 3).into_bits()); assert_eq!(r, 1_u16); } #[simd_test = "neon"] - unsafe fn vminvq_u16() { - let r = neon::vminvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)); + unsafe fn test_vminvq_u16() { + let r = vminvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5).into_bits()); assert_eq!(r, 1_u16); } #[simd_test = "neon"] - unsafe fn vminv_u32() { - let r = neon::vminv_u32(u32x2::new(1, 4)); + unsafe fn test_vminv_u32() { + let r = vminv_u32(u32x2::new(1, 4).into_bits()); assert_eq!(r, 1_u32); } #[simd_test = "neon"] - unsafe fn vminvq_u32() { - let r = neon::vminvq_u32(u32x4::new(1, 2, 32, 4)); + unsafe fn test_vminvq_u32() { + let r = vminvq_u32(u32x4::new(1, 2, 32, 4).into_bits()); assert_eq!(r, 1_u32); } #[simd_test = "neon"] - unsafe fn vminv_f32() { - let r = neon::vminv_f32(f32x2::new(1., 4.)); + unsafe fn test_vminv_f32() { + let r = vminv_f32(f32x2::new(1., 4.).into_bits()); assert_eq!(r, 1_f32); } #[simd_test = "neon"] - unsafe fn vminvq_f32() { - let r = neon::vminvq_f32(f32x4::new(1., 2., 32., 4.)); + unsafe fn test_vminvq_f32() { + let r = vminvq_f32(f32x4::new(1., 2., 32., 4.).into_bits()); assert_eq!(r, 1_f32); } #[simd_test = "neon"] - unsafe fn vminvq_f64() { - let r = neon::vminvq_f64(f64x2::new(1., 4.)); + unsafe fn test_vminvq_f64() { + let r = vminvq_f64(f64x2::new(1., 4.).into_bits()); assert_eq!(r, 1_f64); } } diff --git a/coresimd/aarch64/v8.rs b/coresimd/aarch64/v8.rs index 5d6b0f23bc..b537df9a3c 100644 --- a/coresimd/aarch64/v8.rs +++ b/coresimd/aarch64/v8.rs @@ -22,17 +22,12 @@ pub unsafe fn _clz_u64(x: u64) -> u64 { x.leading_zeros() as u64 } -#[allow(dead_code)] -extern "C" { - #[link_name = "llvm.bitreverse.i64"] - fn rbit_u64(i: i64) -> i64; -} - /// Reverse the bit order. #[inline] #[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u64(x: u64) -> u64 { - rbit_u64(x as i64) as u64 + use intrinsics::bitreverse; + bitreverse(x) } /// Counts the leading most significant bits set. diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs index f1611e70b0..d778ed6f14 100644 --- a/coresimd/arm/mod.rs +++ b/coresimd/arm/mod.rs @@ -7,14 +7,21 @@ //! http://infocenter.arm.com/help/topic/com.arm.doc. //! ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics +#![allow(non_camel_case_types)] mod v6; pub use self::v6::*; +#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] mod v7; +#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] pub use self::v7::*; -#[cfg(target_feature = "neon")] +// NEON is supported on AArch64, and on ARM when built with the v7 and neon +// features. Building ARM without neon produces incorrect codegen. 
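+// For example, downstream code would mirror this gating (ci/run.sh above
+// passes `-C target-feature=+neon` for the armv7 targets, where the `v7`
+// feature is assumed to be implied by the target):
+//
+//     #[cfg(any(target_arch = "aarch64",
+//               all(target_feature = "v7", target_feature = "neon")))]
+//     use coresimd::arm::neon::*;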
+#[cfg(any(target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon")))] mod neon; -#[cfg(target_feature = "neon")] +#[cfg(any(target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon")))] pub use self::neon::*; diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs index e19fa7d4a3..43ce2bec13 100644 --- a/coresimd/arm/neon.rs +++ b/coresimd/arm/neon.rs @@ -2,610 +2,979 @@ #[cfg(test)] use stdsimd_test::assert_instr; - -use coresimd::simd_llvm::simd_add; +use coresimd::simd_llvm::*; use coresimd::simd::*; -use convert::{From, Into}; + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. + // pub struct float16x4_t(f16, f16, f16, f16); + /// ARM-specific 64-bit wide polynomial vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. + pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + ); + /// ARM-specific 128-bit wide polynomial vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`. + // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16, f16); + /// ARM-specific 128-bit wide polynomial vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`.
+ pub struct uint64x2_t(u64, u64); +} + +impl_from_bits_!( + int8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int64x1_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + float32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + poly8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + poly16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); + +impl_from_bits_!( + int8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + poly8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + poly16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + float32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); + +#[allow(improper_ctypes)] +extern "C" { + #[cfg(target_arch = "aarch64")] + #[link_name = "llvm.aarch64.neon.frsqrte.v2f32"] + fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t; +} /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { simd_add(a, b) } /// Vector add. 
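// Sketch of the `impl_from_bits_!` conversions above (illustration only):
// any 64-bit portable vector reinterprets into a 64-bit NEON type and back,
// which is how the tests below reach these intrinsics:
//
//     let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
//     let v: int8x8_t = a.into_bits();         // same 64 bits, NEON type
//     let r: i8x8 = vadd_s8(v, v).into_bits(); // i8x8::new(2, 4, ..., 16)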
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_add(a, b) } /// Vector add. 
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))] +pub unsafe fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))] +pub unsafe fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { simd_add(a, b) } /// Vector long add. 
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { - let a = i16x8::from(a); - let b = i16x8::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { - let a = i32x4::from(a); - let b = i32x4::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { - let a = i64x2::from(a); - let b = i64x2::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { - let a = u16x8::from(a); - let b = u16x8::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { - let a = u32x4::from(a); - let b = u32x4::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); simd_add(a, b) } /// Vector long add. 
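// The long adds widen each lane before adding, so the sum cannot wrap in the
// narrow lane type. For example, from `test_vaddl_u8` below:
//
//     let v = ::std::u8::MAX; // 255
//     let a = u8x8::new(v, v, v, v, v, v, v, v);
//     let r: u16x8 = vaddl_u8(a.into_bits(), a.into_bits()).into_bits();
//     // every lane is 510_u16 rather than a wrapped 254_u8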
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { - let a = u64x2::from(a); - let b = u64x2::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); simd_add(a, b) } -#[allow(improper_ctypes)] -extern "C" { - // The Reference says this instruction is - // supported in v7/A32/A64: - #[link_name = "llvm.aarch64.neon.frsqrte.v2f32"] - fn frsqrte_v2f32(a: f32x2) -> f32x2; -} - -/// Reciprocal square-root estimate. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(frsqrte))] -pub unsafe fn vrsqrte_f32(a: f32x2) -> f32x2 { - frsqrte_v2f32(a) -} - /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s16(a: i16x8) -> i8x8 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s32(a: i32x4) -> i16x4 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s64(a: i64x2) -> i32x2 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u16(a: u16x8) -> u8x8 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u32(a: u32x4) -> u16x4 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t { + simd_cast(a) } /// Vector narrow integer. 
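// `vmovn_*` narrows via `simd_cast`, i.e. each lane is truncated to the
// narrower lane type. A sketch with an assumed out-of-range lane value:
//
//     let a = u16x8::new(0x0104, 1, 2, 3, 4, 5, 6, 7);
//     let r: u8x8 = vmovn_u16(a.into_bits()).into_bits();
//     assert_eq!(r.extract(0), 0x04); // the high byte is discarded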
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u64(a: u64x2) -> u32x2 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s8(a: i8x8) -> i16x8 { - a.into() +pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s16(a: i16x4) -> i32x4 { - a.into() +pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s32(a: i32x2) -> i64x2 { - a.into() +pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u8(a: u8x8) -> u16x8 { - a.into() +pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u16(a: u16x4) -> u32x4 { - a.into() +pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u32(a: u32x2) -> u64x2 { - a.into() +pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t { + simd_cast(a) +} + +/// Reciprocal square-root estimate. 
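// `vrsqrte_f32` returns a table-driven per-lane estimate of 1/sqrt(x), not an
// exact result; `test_vrsqrt_f32` below pins the estimated values:
//
//     let r: f32x2 = vrsqrte_f32(f32x2::new(1.0, 2.0).into_bits()).into_bits();
//     // r == f32x2::new(0.9980469, 0.7050781), close to (1.0, 0.70710677)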
+#[cfg(target_arch = "aarch64")] +// FIXME (https://github.com/rust-lang-nursery/stdsimd/issues/383) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { + frsqrte_v2f32(a) } #[cfg(test)] mod tests { use stdsimd_test::simd_test; use simd::*; - use coresimd::arm::neon; + use coresimd::arm::*; + use std::mem; #[simd_test = "neon"] - unsafe fn vadd_s8() { + unsafe fn test_vadd_s8() { let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = i8x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = i8x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vadd_s8(a, b); + let r: i8x8 = vadd_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s8() { + unsafe fn test_vaddq_s8() { let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_s8(a, b); + let r: i8x16 = vaddq_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_s16() { + unsafe fn test_vadd_s16() { let a = i16x4::new(1, 2, 3, 4); let b = i16x4::new(8, 7, 6, 5); let e = i16x4::new(9, 9, 9, 9); - let r = neon::vadd_s16(a, b); + let r: i16x4 = vadd_s16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s16() { + unsafe fn test_vaddq_s16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = i16x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_s16(a, b); + let r: i16x8 = vaddq_s16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_s32() { + unsafe fn test_vadd_s32() { let a = i32x2::new(1, 2); let b = i32x2::new(8, 7); let e = i32x2::new(9, 9); - let r = neon::vadd_s32(a, b); + let r: i32x2 = vadd_s32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s32() { + unsafe fn test_vaddq_s32() { let a = i32x4::new(1, 2, 3, 4); let b = i32x4::new(8, 7, 6, 5); let e = i32x4::new(9, 9, 9, 9); - let r = neon::vaddq_s32(a, b); + let r: i32x4 = vaddq_s32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u8() { + unsafe fn test_vadd_u8() { let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = u8x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = u8x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vadd_u8(a, b); + let r: u8x8 = vadd_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u8() { + unsafe fn test_vaddq_u8() { let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_u8(a, b); + let r: u8x16 = vaddq_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u16() { + unsafe fn test_vadd_u16() { let a = u16x4::new(1, 2, 3, 4); let b = u16x4::new(8, 7, 6, 5); let e = u16x4::new(9, 9, 9, 9); - let r = neon::vadd_u16(a, b); + let r: u16x4 = vadd_u16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u16() { + unsafe fn test_vaddq_u16() { let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = u16x8::new(9, 9, 9, 
9, 9, 9, 9, 9); - let r = neon::vaddq_u16(a, b); + let r: u16x8 = vaddq_u16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u32() { + unsafe fn test_vadd_u32() { let a = u32x2::new(1, 2); let b = u32x2::new(8, 7); let e = u32x2::new(9, 9); - let r = neon::vadd_u32(a, b); + let r: u32x2 = vadd_u32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u32() { + unsafe fn test_vaddq_u32() { let a = u32x4::new(1, 2, 3, 4); let b = u32x4::new(8, 7, 6, 5); let e = u32x4::new(9, 9, 9, 9); - let r = neon::vaddq_u32(a, b); + let r: u32x4 = vaddq_u32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_f32() { + unsafe fn test_vadd_f32() { let a = f32x2::new(1., 2.); let b = f32x2::new(8., 7.); let e = f32x2::new(9., 9.); - let r = neon::vadd_f32(a, b); + let r: f32x2 = vadd_f32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_f32() { + unsafe fn test_vaddq_f32() { let a = f32x4::new(1., 2., 3., 4.); let b = f32x4::new(8., 7., 6., 5.); let e = f32x4::new(9., 9., 9., 9.); - let r = neon::vaddq_f32(a, b); + let r: f32x4 = vaddq_f32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s8() { + unsafe fn test_vaddl_s8() { let v = ::std::i8::MAX; let a = i8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as i16); let e = i16x8::new(v, v, v, v, v, v, v, v); - let r = neon::vaddl_s8(a, a); + let r: i16x8 = vaddl_s8(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s16() { + unsafe fn test_vaddl_s16() { let v = ::std::i16::MAX; let a = i16x4::new(v, v, v, v); let v = 2 * (v as i32); let e = i32x4::new(v, v, v, v); - let r = neon::vaddl_s16(a, a); + let r: i32x4 = vaddl_s16(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s32() { + unsafe fn test_vaddl_s32() { let v = ::std::i32::MAX; let a = i32x2::new(v, v); let v = 2 * (v as i64); let e = i64x2::new(v, v); - let r = neon::vaddl_s32(a, a); + let r: i64x2 = vaddl_s32(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u8() { + unsafe fn test_vaddl_u8() { let v = ::std::u8::MAX; let a = u8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as u16); let e = u16x8::new(v, v, v, v, v, v, v, v); - let r = neon::vaddl_u8(a, a); + let r: u16x8 = vaddl_u8(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u16() { + unsafe fn test_vaddl_u16() { let v = ::std::u16::MAX; let a = u16x4::new(v, v, v, v); let v = 2 * (v as u32); let e = u32x4::new(v, v, v, v); - let r = neon::vaddl_u16(a, a); + let r: u32x4 = vaddl_u16(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u32() { + unsafe fn test_vaddl_u32() { let v = ::std::u32::MAX; let a = u32x2::new(v, v); let v = 2 * (v as u64); let e = u64x2::new(v, v); - let r = neon::vaddl_u32(a, a); + let r: u64x2 = vaddl_u32(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vrsqrt_f32() { - let a = f32x2::new(1.0, 2.0); - let e = f32x2::new(0.9980469, 0.7050781); - let r = neon::vrsqrte_f32(a); - assert_eq!(r, e); - } - - #[simd_test = "neon"] - unsafe fn vmovn_s16() { + unsafe fn test_vmovn_s16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 
8); - let r = neon::vmovn_s16(a); + let r: i8x8 = vmovn_s16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_s32() { + unsafe fn test_vmovn_s32() { let a = i32x4::new(1, 2, 3, 4); let e = i16x4::new(1, 2, 3, 4); - let r = neon::vmovn_s32(a); + let r: i16x4 = vmovn_s32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_s64() { + unsafe fn test_vmovn_s64() { let a = i64x2::new(1, 2); let e = i32x2::new(1, 2); - let r = neon::vmovn_s64(a); + let r: i32x2 = vmovn_s64(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u16() { + unsafe fn test_vmovn_u16() { let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovn_u16(a); + let r: u8x8 = vmovn_u16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u32() { + unsafe fn test_vmovn_u32() { let a = u32x4::new(1, 2, 3, 4); let e = u16x4::new(1, 2, 3, 4); - let r = neon::vmovn_u32(a); + let r: u16x4 = vmovn_u32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u64() { + unsafe fn test_vmovn_u64() { let a = u64x2::new(1, 2); let e = u32x2::new(1, 2); - let r = neon::vmovn_u64(a); + let r: u32x2 = vmovn_u64(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s8() { + unsafe fn test_vmovl_s8() { let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovl_s8(a); + let r: i16x8 = vmovl_s8(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s16() { + unsafe fn test_vmovl_s16() { let e = i32x4::new(1, 2, 3, 4); let a = i16x4::new(1, 2, 3, 4); - let r = neon::vmovl_s16(a); + let r: i32x4 = vmovl_s16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s32() { + unsafe fn test_vmovl_s32() { let e = i64x2::new(1, 2); let a = i32x2::new(1, 2); - let r = neon::vmovl_s32(a); + let r: i64x2 = vmovl_s32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u8() { + unsafe fn test_vmovl_u8() { let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovl_u8(a); + let r: u16x8 = vmovl_u8(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u16() { + unsafe fn test_vmovl_u16() { let e = u32x4::new(1, 2, 3, 4); let a = u16x4::new(1, 2, 3, 4); - let r = neon::vmovl_u16(a); + let r: u32x4 = vmovl_u16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u32() { + unsafe fn test_vmovl_u32() { let e = u64x2::new(1, 2); let a = u32x2::new(1, 2); - let r = neon::vmovl_u32(a); + let r: u64x2 = vmovl_u32(a.into_bits()).into_bits(); + assert_eq!(r, e); + } + + #[cfg(target_arch = "aarch64")] + #[simd_test = "neon"] + unsafe fn test_vrsqrt_f32() { + let a = f32x2::new(1.0, 2.0); + let e = f32x2::new(0.9980469, 0.7050781); + let r: f32x2 = vrsqrte_f32(a.into_bits()).into_bits(); assert_eq!(r, e); } } diff --git a/coresimd/arm/v7.rs b/coresimd/arm/v7.rs index 8e7abef756..aefd7c7465 100644 --- a/coresimd/arm/v7.rs +++ b/coresimd/arm/v7.rs @@ -14,38 +14,41 @@ use stdsimd_test::assert_instr; /// Count Leading Zeros. 
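// Semantics sketch for the scalar CLZ/RBIT helpers below (these are plain
// Rust equivalents, so the values are exact):
//
//     assert_eq!(_clz_u32(1), 31);           // 31 leading zero bits
//     assert_eq!(_rbit_u32(1), 0x8000_0000); // bit 0 moves to bit 31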
#[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u8(x: u8) -> u8 { x.leading_zeros() as u8 } /// Count Leading Zeros. #[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u16(x: u16) -> u16 { x.leading_zeros() as u16 } /// Count Leading Zeros. #[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u32(x: u32) -> u32 { x.leading_zeros() as u32 } /// Reverse the bit order. #[inline] -#[cfg_attr(test, assert_instr(rbit))] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc +#[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u32(x: u32) -> u32 { - rbit_u32(x as i32) as u32 -} - -#[allow(dead_code)] -extern "C" { - #[link_name = "llvm.bitreverse.i32"] - fn rbit_u32(i: i32) -> i32; + use intrinsics::bitreverse; + bitreverse(x) } #[cfg(test)] diff --git a/coresimd/macros.rs b/coresimd/macros.rs new file mode 100644 index 0000000000..5d28abd41f --- /dev/null +++ b/coresimd/macros.rs @@ -0,0 +1,23 @@ +//! Utility macros. + +#[allow(unused)] +macro_rules! types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + pub struct $name($($fields)*); + + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl ::clone::Clone for $name { + #[inline] // currently needed for correctness + fn clone(&self) -> $name { + *self + } + } + )*) +} diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 088142d17c..5e4361619e 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -1,5 +1,8 @@ //! `coresimd` +#[macro_use] +mod macros; + #[macro_use] mod ppsv; diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs index 8590ad41f2..840a6af92f 100644 --- a/coresimd/ppsv/mod.rs +++ b/coresimd/ppsv/mod.rs @@ -59,18 +59,19 @@ pub trait IntoBits<T>: ::marker::Sized { fn into_bits(self) -> T; } -// FromBits implies IntoBits +// FromBits implies IntoBits. impl<T, U> IntoBits<U> for T where U: FromBits<T>, { #[inline] fn into_bits(self) -> U { + debug_assert!(::mem::size_of::<Self>() == ::mem::size_of::<U>()); U::from_bits(self) } } -// FromBits (and thus IntoBits) is reflexive +// FromBits (and thus IntoBits) is reflexive. impl<T> FromBits<T> for T { #[inline] fn from_bits(t: Self) -> Self { diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs index c0beb3ada1..670c819962 100644 --- a/coresimd/ppsv/v128.rs +++ b/coresimd/ppsv/v128.rs @@ -78,6 +78,72 @@ simd_f_ty! { /// A 128-bit vector with 2 `f64` lanes.
} +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::{__m128, __m128d, __m128i}; +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::{__m128, __m128d, __m128i}; + +macro_rules! from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m128, __m128i, __m128d); + } +} + +#[cfg(all(target_arch = "arm", target_feature = "v7"))] +use coresimd::arch::arm::{// FIXME: float16x8_t, + float32x4_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t}; + +#[cfg(target_arch = "aarch64")] +use coresimd::arch::aarch64::{// FIXME: float16x8_t, + float32x4_t, + float64x2_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t}; + +macro_rules! from_bits_arm { + ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { + #[cfg(any(all(target_arch = "arm", target_feature = "v7"), target_arch = "aarch64"))] + impl_from_bits_!( + $id: + int8x16_t, + uint8x16_t, + int16x8_t, + uint16x8_t, + int32x4_t, + uint32x4_t, + int64x2_t, + uint64x2_t, + // FIXME: float16x8_t, + float32x4_t, + poly8x16_t, + poly16x8_t + ); + #[cfg(target_arch = "aarch64")] + impl_from_bits_!( + $id: float64x2_t + ); + } +} + impl_from_bits!( u64x2: u64, u64x2_from_bits, @@ -92,6 +158,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u64x2, u64, u64x2_from_bits_x86); +from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64); + impl_from_bits!( i64x2: i64, i64x2_from_bits, @@ -106,6 +175,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i64x2, i64, i64x2_from_bits_x86); +from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64); + impl_from_bits!( f64x2: f64, f64x2_from_bits, @@ -120,6 +192,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(f64x2, f64, f64x2_from_bits_x86); +from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64); + impl_from_bits!( u32x4: u32, u32x4_from_bits, @@ -134,6 +209,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u32x4, u32, u32x4_from_bits_x86); +from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64); + impl_from_bits!( i32x4: i32, i32x4_from_bits, @@ -148,6 +226,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i32x4, i32, i32x4_from_bits_x86); +from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64); + impl_from_bits!( f32x4: f32, f32x4_from_bits, @@ -162,6 +243,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(f32x4, f32, f32x4_from_bits_x86); +from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64); + impl_from_bits!( u16x8: u16, u16x8_from_bits, @@ -176,6 +260,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u16x8, u16, u16x8_from_bits_x86); +from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64); + impl_from_bits!( i16x8: i16, i16x8_from_bits, @@ -190,6 +277,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i16x8, i16, i16x8_from_bits_x86); +from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64); + impl_from_bits!( u8x16: u8, u8x16_from_bits, @@ -204,6 +294,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u8x16, u8, u8x16_from_bits_x86); +from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64); + impl_from_bits!( i8x16: i8, i8x16_from_bits, @@ -218,32 +311,8 @@ impl_from_bits!( u8x16, b8x16 ); - -#[cfg(target_arch = "x86")] -use 
coresimd::arch::x86::{__m128, __m128d, __m128i}; -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::{__m128, __m128d, __m128i}; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x8: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x8: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x16: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x16: __m128, __m128i, __m128d); +from_bits_x86!(i8x16, i8, i8x16_from_bits_x86); +from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64); impl_from!( f64x2: f64, diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs index f88a209b96..b2dfc65905 100644 --- a/coresimd/ppsv/v256.rs +++ b/coresimd/ppsv/v256.rs @@ -94,6 +94,18 @@ simd_f_ty! { /// A 256-bit vector with 4 `f64` lanes. } +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::{__m256, __m256d, __m256i}; +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::{__m256, __m256d, __m256i}; + +macro_rules! 
from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m256, __m256i, __m256d); + } +} + impl_from_bits!( i8x32: i8, i8x32_from_bits, @@ -108,6 +120,8 @@ impl_from_bits!( u8x32, b8x32 ); +from_bits_x86!(i8x32, i8, i8x32_from_bits_x86); + impl_from_bits!( u8x32: u8, u8x32_from_bits, @@ -122,6 +136,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u8x32, u8, u8x32_from_bits_x86); + impl_from_bits!( i16x16: i16, i16x16_from_bits, @@ -136,6 +152,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i16x16, i16, i16x16_from_bits_x86); + impl_from_bits!( u16x16: u16, u16x16_from_bits, @@ -150,6 +168,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u16x16, u16, u16x16_from_bits_x86); + impl_from_bits!( i32x8: i32, i32x8_from_bits, @@ -164,6 +184,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i32x8, i32, i32x8_from_bits_x86); + impl_from_bits!( u32x8: u32, u32x8_from_bits, @@ -178,6 +200,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u32x8, u32, u32x8_from_bits_x86); + impl_from_bits!( f32x8: f32, f32x8_from_bits, @@ -192,6 +216,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(f32x8, f32, f32x8_from_bits_x86); + impl_from_bits!( i64x4: i64, i64x4_from_bits, @@ -206,6 +232,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i64x4, i64, i64x4_from_bits_x86); + impl_from_bits!( u64x4: u64, u64x4_from_bits, @@ -220,6 +248,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u64x4, u64, u64x4_from_bits_x86); + impl_from_bits!( f64x4: f64, f64x4_from_bits, @@ -234,32 +264,7 @@ impl_from_bits!( i8x32, b8x32 ); - -#[cfg(target_arch = "x86")] -use coresimd::arch::x86::{__m256, __m256d, __m256i}; -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::{__m256, __m256d, __m256i}; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x16: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x16: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x32: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x32: __m256, __m256i, __m256d); +from_bits_x86!(f64x4, f64, f64x4_from_bits_x86); impl_from!( f64x4: f64, diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs index 6da839861d..a82511dd9e 100644 --- a/coresimd/ppsv/v64.rs +++ b/coresimd/ppsv/v64.rs @@ -57,6 +57,73 @@ simd_f_ty! { /// A 64-bit vector with 2 `f32` lanes. } +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::__m64; + +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::__m64; + +macro_rules! 
from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m64); + } +} + +#[cfg(all(target_arch = "arm", target_feature = "v7"))] +use coresimd::arch::arm::{// FIXME: float16x4_t, + float32x2_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t}; + +#[cfg(target_arch = "aarch64")] +use coresimd::arch::aarch64::{// FIXME: float16x4_t, + float32x2_t, + float64x1_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t}; + +macro_rules! from_bits_arm { + ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { + #[cfg(any(all(target_arch = "arm", target_feature = "v7"), target_arch = "aarch64"))] + impl_from_bits_!( + $id: + int64x1_t, + uint64x1_t, + uint32x2_t, + int32x2_t, + float32x2_t, + uint16x4_t, + int16x4_t, + // FIXME: float16x4_t + poly16x4_t, + uint8x8_t, + int8x8_t, + poly8x8_t + ); + #[cfg(target_arch = "aarch64")] + impl_from_bits_!( + $id: float64x1_t + ); + } +} + impl_from_bits!( u32x2: u32, u32x2_from_bits, @@ -68,6 +135,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u32x2, u32, u32x2_from_bits_x86); +from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64); + impl_from_bits!( i32x2: i32, i32x2_from_bits, @@ -79,6 +149,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(i32x2, i32, i32x2_from_bits_x86); +from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64); + impl_from_bits!( f32x2: f32, f32x2_from_bits, @@ -90,6 +163,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(f32x2, f32, f32x2_from_bits_x86); +from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64); + impl_from_bits!( u16x4: u16, u16x4_from_bits, @@ -100,6 +176,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u16x4, u16, u16x4_from_bits_x86); +from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64); + impl_from_bits!( i16x4: i16, i16x4_from_bits, @@ -110,6 +189,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(i16x4, i16, i16x4_from_bits_x86); +from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64); + impl_from_bits!( u8x8: u8, u8x8_from_bits, @@ -120,6 +202,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u8x8, u8, u8x8_from_bits_x86); +from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64); + impl_from_bits!( i8x8: i8, i8x8_from_bits, @@ -130,27 +215,8 @@ impl_from_bits!( u8x8, b8x8 ); - -#[cfg(target_arch = "x86")] -use coresimd::arch::x86::__m64; - -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::__m64; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x4: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x4: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x8: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x8: __m64); +from_bits_x86!(i8x8, i8, i8x8_from_bits_x86); +from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64); impl_from!( f32x2: f32, diff --git a/coresimd/x86/mod.rs 
b/coresimd/x86/mod.rs index 32915c3329..653e0f8bf7 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -6,27 +6,6 @@ use mem; #[macro_use] mod macros; -macro_rules! types { - ($( - $(#[$doc:meta])* - pub struct $name:ident($($fields:tt)*); - )*) => ($( - $(#[$doc])* - #[derive(Copy, Debug)] - #[allow(non_camel_case_types)] - #[repr(simd)] - pub struct $name($($fields)*); - - #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] - impl Clone for $name { - #[inline] // currently needed for correctness - fn clone(&self) -> $name { - *self - } - } - )*) -} - types! { /// 64-bit wide integer vector type, x86-specific /// @@ -459,12 +438,12 @@ impl m256iExt for __m256i { } } -use coresimd::simd::{b8x16, b8x32, b8x8, f32x4, f32x8, f64x2, f64x4, i16x16, - i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, - i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, u32x8, - u64x2, u64x4, u8x16, u8x32, u8x8}; +use coresimd::simd::{b8x16, b8x32, b8x8, f32x2, f32x4, f32x8, f64x2, f64x4, + i16x16, i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, + i8x16, i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, + u32x8, u64x2, u64x4, u8x16, u8x32, u8x8}; -impl_from_bits_!(__m64: u32x2, i32x2, u16x4, i16x4, u8x8, i8x8, b8x8); +impl_from_bits_!(__m64: u32x2, i32x2, f32x2, u16x4, i16x4, u8x8, i8x8, b8x8); impl_from_bits_!( __m128: u64x2, i64x2, diff --git a/crates/coresimd/tests/v128.rs b/crates/coresimd/tests/v128.rs index 8eb1e1801c..d880e3a23d 100644 --- a/crates/coresimd/tests/v128.rs +++ b/crates/coresimd/tests/v128.rs @@ -1,7 +1,8 @@ //! coresimd 128-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v16.rs b/crates/coresimd/tests/v16.rs index b44c03f281..71d6cdb44f 100644 --- a/crates/coresimd/tests/v16.rs +++ b/crates/coresimd/tests/v16.rs @@ -1,7 +1,8 @@ //! coresimd 16-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v256.rs b/crates/coresimd/tests/v256.rs index e4f7416d2d..5149ba0204 100644 --- a/crates/coresimd/tests/v256.rs +++ b/crates/coresimd/tests/v256.rs @@ -1,7 +1,8 @@ //! coresimd 256-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v32.rs b/crates/coresimd/tests/v32.rs index 83991fc8b2..007546833d 100644 --- a/crates/coresimd/tests/v32.rs +++ b/crates/coresimd/tests/v32.rs @@ -1,7 +1,8 @@ //! 
coresimd 32-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v512.rs b/crates/coresimd/tests/v512.rs index 6420ecb68e..31563f6f0d 100644 --- a/crates/coresimd/tests/v512.rs +++ b/crates/coresimd/tests/v512.rs @@ -1,7 +1,8 @@ //! coresimd 512-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v64.rs b/crates/coresimd/tests/v64.rs index 5434b4c5ab..ce27e809e3 100644 --- a/crates/coresimd/tests/v64.rs +++ b/crates/coresimd/tests/v64.rs @@ -1,7 +1,8 @@ //! coresimd 64-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/simd-test-macro/src/lib.rs b/crates/simd-test-macro/src/lib.rs index 366ad7d29e..c3c570a696 100644 --- a/crates/simd-test-macro/src/lib.rs +++ b/crates/simd-test-macro/src/lib.rs @@ -77,7 +77,7 @@ pub fn simd_test( .expect(&format!("target triple contained no \"-\": {}", target)) { "i686" | "x86_64" | "i586" => "is_x86_feature_detected", - "arm" => "is_arm_feature_detected", + "arm" | "armv7" => "is_arm_feature_detected", "aarch64" => "is_aarch64_feature_detected", "powerpc64" => "is_powerpc64_feature_detected", "mips" | "mipsel" => { diff --git a/examples/nbody.rs b/examples/nbody.rs index 4abe850d8e..b78ce8be67 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -43,21 +43,21 @@ impl Frsqrt for f64x2 { }; Self::new(u.extract(0), u.extract(1)) } - #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), - target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { #[cfg(target_arch = "arm")] use stdsimd::arch::arm::*; #[cfg(target_arch = "aarch64")] use stdsimd::arch::aarch64::*; - unsafe { vrsqrte_f32((*self).into()).into() } + let t: f32x2 = (*self).into(); + let t: f32x2 = unsafe { vrsqrte_f32(t.into_bits()).into_bits() }; + t.into() } #[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse"), - all(any(target_arch = "arm", - target_arch = "aarch64"), + all(target_arch = "aarch64", target_feature = "neon"))))] { self.replace(0, 1. / self.extract(0).sqrt()); diff --git a/stdsimd/arch/detect/arch/arm.rs b/stdsimd/arch/detect/arch/arm.rs index 852e9b8d2c..08b6a7e87e 100644 --- a/stdsimd/arch/detect/arch/arm.rs +++ b/stdsimd/arch/detect/arch/arm.rs @@ -11,6 +11,10 @@ macro_rules! 
is_arm_feature_detected { cfg!(target_feature = "pmull") || $crate::arch::detect::check_for($crate::arch::detect::Feature::pmull) }; + ("v7") => { compile_error!("\"v7\" feature cannot be detected at run-time") }; + ("vfp2") => { compile_error!("\"vfp2\" feature cannot be detected at run-time") }; + ("vfp3") => { compile_error!("\"vfp3\" feature cannot be detected at run-time") }; + ("vfp4") => { compile_error!("\"vfp4\" feature cannot be detected at run-time") }; ($t:tt) => { compile_error!(concat!("unknown arm target feature: ", $t)) }; }
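Usage sketch (illustration only, not part of the patch): with these changes the portable `ppsv` vectors and the ARM-specific NEON types interconvert through the `FromBits`/`IntoBits` traits, which is the pattern the updated tests and `examples/nbody.rs` above rely on. The snippet below is a minimal sketch assuming a nightly toolchain, an aarch64 target built with `-C target-feature=+neon`, and that `FromBits`/`IntoBits` are re-exported from `stdsimd::simd`; the function name `rsqrte_example` is made up for the example.

#![feature(stdsimd)]
extern crate stdsimd;

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
fn rsqrte_example() {
    use stdsimd::simd::{f32x2, FromBits, IntoBits};
    use stdsimd::arch::aarch64::{float32x2_t, vrsqrte_f32};

    let a = f32x2::new(1.0, 2.0);
    // Portable vector -> NEON vector; `into_bits` is a size-checked
    // bit-cast (see the new debug_assert in coresimd/ppsv/mod.rs).
    let n: float32x2_t = a.into_bits();
    // Call the NEON intrinsic, then convert back to the portable type
    // so the result can be compared lane-wise.
    let r = f32x2::from_bits(unsafe { vrsqrte_f32(n) });
    assert_eq!(r, f32x2::new(0.9980469, 0.7050781));
}

The expected lanes match the `test_vrsqrt_f32` test above; `vrsqrte_f32` computes only an estimate of `1/sqrt(x)`, so the exact values are specific to this input.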