From b3eb737eb167779fce64d11e8ee760221e5cae1a Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Sun, 18 Mar 2018 11:07:44 +0100 Subject: [PATCH] add arm neon vector types --- ci/run.sh | 10 + coresimd/aarch64/mod.rs | 2 - coresimd/aarch64/neon.rs | 329 +++++++------ coresimd/aarch64/v8.rs | 9 +- coresimd/arm/mod.rs | 11 +- coresimd/arm/neon.rs | 737 ++++++++++++++++++++++-------- coresimd/arm/v7.rs | 27 +- coresimd/macros.rs | 23 + coresimd/mod.rs | 3 + coresimd/ppsv/mod.rs | 5 +- coresimd/ppsv/v128.rs | 121 +++-- coresimd/ppsv/v256.rs | 57 +-- coresimd/ppsv/v64.rs | 108 ++++- coresimd/x86/mod.rs | 31 +- crates/coresimd/tests/v128.rs | 5 +- crates/coresimd/tests/v16.rs | 5 +- crates/coresimd/tests/v256.rs | 5 +- crates/coresimd/tests/v32.rs | 5 +- crates/coresimd/tests/v512.rs | 5 +- crates/coresimd/tests/v64.rs | 5 +- crates/simd-test-macro/src/lib.rs | 2 +- examples/nbody.rs | 10 +- stdsimd/arch/detect/arch/arm.rs | 4 + 23 files changed, 1046 insertions(+), 473 deletions(-) create mode 100644 coresimd/macros.rs diff --git a/ci/run.sh b/ci/run.sh index 1ca1180d18..a27eaa463b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -12,6 +12,16 @@ export RUST_TEST_THREADS=1 FEATURES="strict,$FEATURES" +# FIXME: on armv7 neon intrinsics require the neon target-feature to be +# unconditionally enabled. +case ${TARGET} in + armv7*) + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+neon" + ;; + *) + ;; +esac + echo "RUSTFLAGS=${RUSTFLAGS}" echo "FEATURES=${FEATURES}" echo "OBJDUMP=${OBJDUMP}" diff --git a/coresimd/aarch64/mod.rs b/coresimd/aarch64/mod.rs index f8e0fbd40c..5c794e3750 100644 --- a/coresimd/aarch64/mod.rs +++ b/coresimd/aarch64/mod.rs @@ -11,7 +11,5 @@ mod v8; pub use self::v8::*; -#[cfg(target_feature = "neon")] mod neon; -#[cfg(target_feature = "neon")] pub use self::neon::*; diff --git a/coresimd/aarch64/neon.rs b/coresimd/aarch64/neon.rs index d4f3032e51..2b272c7427 100644 --- a/coresimd/aarch64/neon.rs +++ b/coresimd/aarch64/neon.rs @@ -6,20 +6,51 @@ use stdsimd_test::assert_instr; use coresimd::simd_llvm::simd_add; use coresimd::simd::*; +use coresimd::arm::*; + +types! { + /// ARM-specific 64-bit wide vector of one packed `f64`. + pub struct float64x1_t(f64); // FIXME: check this! + /// ARM-specific 128-bit wide vector of two packed `f64`. + pub struct float64x2_t(f64, f64); +} +impl_from_bits_!( + float64x1_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + float64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); /// Vector add. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { - a + b +pub unsafe fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { + simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { +pub unsafe fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { simd_add(a, b) } @@ -27,85 +58,85 @@ pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { - a + b +pub unsafe fn vaddd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_add(a, b) } /// Vector add. 
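// Usage sketch (illustration only, mirroring `test_vaddq_f64` below): the new
// NEON types are opaque, so callers reinterpret the portable vectors through
// `IntoBits`/`FromBits` at the call boundary:
//
//     let a = f64x2::new(1., 2.);
//     let b = f64x2::new(8., 7.);
//     let r: f64x2 = vaddq_f64(a.into_bits(), b.into_bits()).into_bits();
//     assert_eq!(r, f64x2::new(9., 9.));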
#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 { - a + b +pub unsafe fn vaddd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_add(a, b) } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.aarch64.neon.smaxv.i8.v8i8"] - fn vmaxv_s8_(a: i8x8) -> i8; + fn vmaxv_s8_(a: int8x8_t) -> i8; #[link_name = "llvm.aarch64.neon.smaxv.i8.v16i8"] - fn vmaxvq_s8_(a: i8x16) -> i8; + fn vmaxvq_s8_(a: int8x16_t) -> i8; #[link_name = "llvm.aarch64.neon.smaxv.i16.v4i16"] - fn vmaxv_s16_(a: i16x4) -> i16; + fn vmaxv_s16_(a: int16x4_t) -> i16; #[link_name = "llvm.aarch64.neon.smaxv.i16.v8i16"] - fn vmaxvq_s16_(a: i16x8) -> i16; + fn vmaxvq_s16_(a: int16x8_t) -> i16; #[link_name = "llvm.aarch64.neon.smaxv.i32.v2i32"] - fn vmaxv_s32_(a: i32x2) -> i32; + fn vmaxv_s32_(a: int32x2_t) -> i32; #[link_name = "llvm.aarch64.neon.smaxv.i32.v4i32"] - fn vmaxvq_s32_(a: i32x4) -> i32; + fn vmaxvq_s32_(a: int32x4_t) -> i32; #[link_name = "llvm.aarch64.neon.umaxv.i8.v8i8"] - fn vmaxv_u8_(a: u8x8) -> u8; + fn vmaxv_u8_(a: uint8x8_t) -> u8; #[link_name = "llvm.aarch64.neon.umaxv.i8.v16i8"] - fn vmaxvq_u8_(a: u8x16) -> u8; + fn vmaxvq_u8_(a: uint8x16_t) -> u8; #[link_name = "llvm.aarch64.neon.umaxv.i16.v4i16"] - fn vmaxv_u16_(a: u16x4) -> u16; + fn vmaxv_u16_(a: uint16x4_t) -> u16; #[link_name = "llvm.aarch64.neon.umaxv.i16.v8i16"] - fn vmaxvq_u16_(a: u16x8) -> u16; + fn vmaxvq_u16_(a: uint16x8_t) -> u16; #[link_name = "llvm.aarch64.neon.umaxv.i32.v2i32"] - fn vmaxv_u32_(a: u32x2) -> u32; + fn vmaxv_u32_(a: uint32x2_t) -> u32; #[link_name = "llvm.aarch64.neon.umaxv.i32.v4i32"] - fn vmaxvq_u32_(a: u32x4) -> u32; + fn vmaxvq_u32_(a: uint32x4_t) -> u32; #[link_name = "llvm.aarch64.neon.fmaxv.f32.v2f32"] - fn vmaxv_f32_(a: f32x2) -> f32; + fn vmaxv_f32_(a: float32x2_t) -> f32; #[link_name = "llvm.aarch64.neon.fmaxv.f32.v4f32"] - fn vmaxvq_f32_(a: f32x4) -> f32; + fn vmaxvq_f32_(a: float32x4_t) -> f32; #[link_name = "llvm.aarch64.neon.fmaxv.f64.v2f64"] - fn vmaxvq_f64_(a: f64x2) -> f64; + fn vmaxvq_f64_(a: float64x2_t) -> f64; #[link_name = "llvm.aarch64.neon.sminv.i8.v8i8"] - fn vminv_s8_(a: i8x8) -> i8; + fn vminv_s8_(a: int8x8_t) -> i8; #[link_name = "llvm.aarch64.neon.sminv.i8.v16i8"] - fn vminvq_s8_(a: i8x16) -> i8; + fn vminvq_s8_(a: int8x16_t) -> i8; #[link_name = "llvm.aarch64.neon.sminv.i16.v4i16"] - fn vminv_s16_(a: i16x4) -> i16; + fn vminv_s16_(a: int16x4_t) -> i16; #[link_name = "llvm.aarch64.neon.sminv.i16.v8i16"] - fn vminvq_s16_(a: i16x8) -> i16; + fn vminvq_s16_(a: int16x8_t) -> i16; #[link_name = "llvm.aarch64.neon.sminv.i32.v2i32"] - fn vminv_s32_(a: i32x2) -> i32; + fn vminv_s32_(a: int32x2_t) -> i32; #[link_name = "llvm.aarch64.neon.sminv.i32.v4i32"] - fn vminvq_s32_(a: i32x4) -> i32; + fn vminvq_s32_(a: int32x4_t) -> i32; #[link_name = "llvm.aarch64.neon.uminv.i8.v8i8"] - fn vminv_u8_(a: u8x8) -> u8; + fn vminv_u8_(a: uint8x8_t) -> u8; #[link_name = "llvm.aarch64.neon.uminv.i8.v16i8"] - fn vminvq_u8_(a: u8x16) -> u8; + fn vminvq_u8_(a: uint8x16_t) -> u8; #[link_name = "llvm.aarch64.neon.uminv.i16.v4i16"] - fn vminv_u16_(a: u16x4) -> u16; + fn vminv_u16_(a: uint16x4_t) -> u16; #[link_name = "llvm.aarch64.neon.uminv.i16.v8i16"] - fn vminvq_u16_(a: u16x8) -> u16; + fn vminvq_u16_(a: uint16x8_t) -> u16; #[link_name = "llvm.aarch64.neon.uminv.i32.v2i32"] - fn vminv_u32_(a: u32x2) -> u32; + fn vminv_u32_(a: uint32x2_t) -> u32; #[link_name = "llvm.aarch64.neon.uminv.i32.v4i32"] - fn vminvq_u32_(a: u32x4) -> u32; + fn
vminvq_u32_(a: uint32x4_t) -> u32; #[link_name = "llvm.aarch64.neon.fminv.f32.v2f32"] - fn vminv_f32_(a: f32x2) -> f32; + fn vminv_f32_(a: float32x2_t) -> f32; #[link_name = "llvm.aarch64.neon.fminv.f32.v4f32"] - fn vminvq_f32_(a: f32x4) -> f32; + fn vminvq_f32_(a: float32x4_t) -> f32; #[link_name = "llvm.aarch64.neon.fminv.f64.v2f64"] - fn vminvq_f64_(a: f64x2) -> f64; + fn vminvq_f64_(a: float64x2_t) -> f64; } @@ -113,7 +144,7 @@ extern "C" { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxv_s8(a: i8x8) -> i8 { +pub unsafe fn vmaxv_s8(a: int8x8_t) -> i8 { vmaxv_s8_(a) } @@ -121,7 +152,7 @@ pub unsafe fn vmaxv_s8(a: i8x8) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s8(a: i8x16) -> i8 { +pub unsafe fn vmaxvq_s8(a: int8x16_t) -> i8 { vmaxvq_s8_(a) } @@ -129,7 +160,7 @@ pub unsafe fn vmaxvq_s8(a: i8x16) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxv_s16(a: i16x4) -> i16 { +pub unsafe fn vmaxv_s16(a: int16x4_t) -> i16 { vmaxv_s16_(a) } @@ -137,7 +168,7 @@ pub unsafe fn vmaxv_s16(a: i16x4) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s16(a: i16x8) -> i16 { +pub unsafe fn vmaxvq_s16(a: int16x8_t) -> i16 { vmaxvq_s16_(a) } @@ -145,7 +176,7 @@ pub unsafe fn vmaxvq_s16(a: i16x8) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxp))] -pub unsafe fn vmaxv_s32(a: i32x2) -> i32 { +pub unsafe fn vmaxv_s32(a: int32x2_t) -> i32 { vmaxv_s32_(a) } @@ -153,7 +184,7 @@ pub unsafe fn vmaxv_s32(a: i32x2) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(smaxv))] -pub unsafe fn vmaxvq_s32(a: i32x4) -> i32 { +pub unsafe fn vmaxvq_s32(a: int32x4_t) -> i32 { vmaxvq_s32_(a) } @@ -161,7 +192,7 @@ pub unsafe fn vmaxvq_s32(a: i32x4) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxv_u8(a: u8x8) -> u8 { +pub unsafe fn vmaxv_u8(a: uint8x8_t) -> u8 { vmaxv_u8_(a) } @@ -169,7 +200,7 @@ pub unsafe fn vmaxv_u8(a: u8x8) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u8(a: u8x16) -> u8 { +pub unsafe fn vmaxvq_u8(a: uint8x16_t) -> u8 { vmaxvq_u8_(a) } @@ -177,7 +208,7 @@ pub unsafe fn vmaxvq_u8(a: u8x16) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxv_u16(a: u16x4) -> u16 { +pub unsafe fn vmaxv_u16(a: uint16x4_t) -> u16 { vmaxv_u16_(a) } @@ -185,7 +216,7 @@ pub unsafe fn vmaxv_u16(a: u16x4) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u16(a: u16x8) -> u16 { +pub unsafe fn vmaxvq_u16(a: uint16x8_t) -> u16 { vmaxvq_u16_(a) } @@ -193,7 +224,7 @@ pub unsafe fn vmaxvq_u16(a: u16x8) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxp))] -pub unsafe fn vmaxv_u32(a: u32x2) -> u32 { +pub unsafe fn vmaxv_u32(a: uint32x2_t) -> u32 { vmaxv_u32_(a) } @@ -201,7 +232,7 @@ pub unsafe fn vmaxv_u32(a: u32x2) -> u32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(umaxv))] -pub unsafe fn vmaxvq_u32(a: u32x4) -> u32 { +pub unsafe fn vmaxvq_u32(a: uint32x4_t) -> u32 { vmaxvq_u32_(a) } @@ -209,7 +240,7 @@ pub unsafe fn vmaxvq_u32(a: u32x4) -> u32 { #[inline] #[target_feature(enable = "neon")] 
#[cfg_attr(test, assert_instr(fmaxp))] -pub unsafe fn vmaxv_f32(a: f32x2) -> f32 { +pub unsafe fn vmaxv_f32(a: float32x2_t) -> f32 { vmaxv_f32_(a) } @@ -217,7 +248,7 @@ pub unsafe fn vmaxv_f32(a: f32x2) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fmaxv))] -pub unsafe fn vmaxvq_f32(a: f32x4) -> f32 { +pub unsafe fn vmaxvq_f32(a: float32x4_t) -> f32 { vmaxvq_f32_(a) } @@ -225,7 +256,7 @@ pub unsafe fn vmaxvq_f32(a: f32x4) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fmaxp))] -pub unsafe fn vmaxvq_f64(a: f64x2) -> f64 { +pub unsafe fn vmaxvq_f64(a: float64x2_t) -> f64 { vmaxvq_f64_(a) } @@ -233,7 +264,7 @@ pub unsafe fn vmaxvq_f64(a: f64x2) -> f64 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminv_s8(a: i8x8) -> i8 { +pub unsafe fn vminv_s8(a: int8x8_t) -> i8 { vminv_s8_(a) } @@ -241,7 +272,7 @@ pub unsafe fn vminv_s8(a: i8x8) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s8(a: i8x16) -> i8 { +pub unsafe fn vminvq_s8(a: int8x16_t) -> i8 { vminvq_s8_(a) } @@ -249,7 +280,7 @@ pub unsafe fn vminvq_s8(a: i8x16) -> i8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminv_s16(a: i16x4) -> i16 { +pub unsafe fn vminv_s16(a: int16x4_t) -> i16 { vminv_s16_(a) } @@ -257,7 +288,7 @@ pub unsafe fn vminv_s16(a: i16x4) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s16(a: i16x8) -> i16 { +pub unsafe fn vminvq_s16(a: int16x8_t) -> i16 { vminvq_s16_(a) } @@ -265,7 +296,7 @@ pub unsafe fn vminvq_s16(a: i16x8) -> i16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminp))] -pub unsafe fn vminv_s32(a: i32x2) -> i32 { +pub unsafe fn vminv_s32(a: int32x2_t) -> i32 { vminv_s32_(a) } @@ -273,7 +304,7 @@ pub unsafe fn vminv_s32(a: i32x2) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sminv))] -pub unsafe fn vminvq_s32(a: i32x4) -> i32 { +pub unsafe fn vminvq_s32(a: int32x4_t) -> i32 { vminvq_s32_(a) } @@ -281,7 +312,7 @@ pub unsafe fn vminvq_s32(a: i32x4) -> i32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminv_u8(a: u8x8) -> u8 { +pub unsafe fn vminv_u8(a: uint8x8_t) -> u8 { vminv_u8_(a) } @@ -289,7 +320,7 @@ pub unsafe fn vminv_u8(a: u8x8) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u8(a: u8x16) -> u8 { +pub unsafe fn vminvq_u8(a: uint8x16_t) -> u8 { vminvq_u8_(a) } @@ -297,7 +328,7 @@ pub unsafe fn vminvq_u8(a: u8x16) -> u8 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminv_u16(a: u16x4) -> u16 { +pub unsafe fn vminv_u16(a: uint16x4_t) -> u16 { vminv_u16_(a) } @@ -305,7 +336,7 @@ pub unsafe fn vminv_u16(a: u16x4) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u16(a: u16x8) -> u16 { +pub unsafe fn vminvq_u16(a: uint16x8_t) -> u16 { vminvq_u16_(a) } @@ -313,7 +344,7 @@ pub unsafe fn vminvq_u16(a: u16x8) -> u16 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminp))] -pub unsafe fn vminv_u32(a: u32x2) -> u32 { +pub unsafe fn vminv_u32(a: uint32x2_t) -> u32 { vminv_u32_(a) } @@ -321,7 +352,7 @@ pub unsafe fn vminv_u32(a: u32x2) -> u32 { #[inline] 
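// Note: the `vmaxv*`/`vminv*` intrinsics in this block are horizontal
// reductions that fold every lane of the input into one scalar. Sketch,
// with values taken from `test_vmaxv_s8` below:
//
//     let r: i8 = vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits());
//     assert_eq!(r, 7); // the largest of the eight lanes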
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uminv))] -pub unsafe fn vminvq_u32(a: u32x4) -> u32 { +pub unsafe fn vminvq_u32(a: uint32x4_t) -> u32 { vminvq_u32_(a) } @@ -329,7 +360,7 @@ pub unsafe fn vminvq_u32(a: u32x4) -> u32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminp))] -pub unsafe fn vminv_f32(a: f32x2) -> f32 { +pub unsafe fn vminv_f32(a: float32x2_t) -> f32 { vminv_f32_(a) } @@ -337,7 +368,7 @@ pub unsafe fn vminv_f32(a: f32x2) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminv))] -pub unsafe fn vminvq_f32(a: f32x4) -> f32 { +pub unsafe fn vminvq_f32(a: float32x4_t) -> f32 { vminvq_f32_(a) } @@ -345,253 +376,257 @@ pub unsafe fn vminvq_f32(a: f32x4) -> f32 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fminp))] -pub unsafe fn vminvq_f64(a: f64x2) -> f64 { +pub unsafe fn vminvq_f64(a: float64x2_t) -> f64 { vminvq_f64_(a) } #[cfg(test)] mod tests { - use simd::*; - use coresimd::aarch64::neon; use stdsimd_test::simd_test; + use simd::*; + use coresimd::aarch64::*; + use std::mem; #[simd_test = "neon"] - unsafe fn vadd_f64() { + unsafe fn test_vadd_f64() { let a = 1.; let b = 8.; let e = 9.; - let r = neon::vadd_f64(a, b); + let r: f64 = + mem::transmute(vadd_f64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_f64() { + unsafe fn test_vaddq_f64() { let a = f64x2::new(1., 2.); let b = f64x2::new(8., 7.); let e = f64x2::new(9., 9.); - let r = neon::vaddq_f64(a, b); + let r: f64x2 = vaddq_f64(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddd_s64() { - let a = 1; - let b = 8; - let e = 9; - let r = neon::vaddd_s64(a, b); + unsafe fn test_vaddd_s64() { + let a = 1_i64; + let b = 8_i64; + let e = 9_i64; + let r: i64 = + mem::transmute(vaddd_s64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddd_u64() { - let a = 1; - let b = 8; - let e = 9; - let r = neon::vaddd_u64(a, b); + unsafe fn test_vaddd_u64() { + let a = 1_u64; + let b = 8_u64; + let e = 9_u64; + let r: u64 = + mem::transmute(vaddd_u64(mem::transmute(a), mem::transmute(b))); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmaxv_s8() { - let r = neon::vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)); + unsafe fn test_vmaxv_s8() { + let r = vmaxv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits()); assert_eq!(r, 7_i8); } #[simd_test = "neon"] - unsafe fn vmaxvq_s8() { + unsafe fn test_vmaxvq_s8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vmaxvq_s8(i8x16::new( + let r = vmaxvq_s8(i8x16::new( 1, 2, 3, 4, -16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 8_i8); } #[simd_test = "neon"] - unsafe fn vmaxv_s16() { - let r = neon::vmaxv_s16(i16x4::new(1, 2, -4, 3)); + unsafe fn test_vmaxv_s16() { + let r = vmaxv_s16(i16x4::new(1, 2, -4, 3).into_bits()); assert_eq!(r, 3_i16); } #[simd_test = "neon"] - unsafe fn vmaxvq_s16() { - let r = neon::vmaxvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)); + unsafe fn test_vmaxvq_s16() { + let r = vmaxvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5).into_bits()); assert_eq!(r, 7_i16); } #[simd_test = "neon"] - unsafe fn vmaxv_s32() { - let r = neon::vmaxv_s32(i32x2::new(1, -4)); + unsafe fn test_vmaxv_s32() { + let r = vmaxv_s32(i32x2::new(1, -4).into_bits()); assert_eq!(r, 1_i32); } #[simd_test = "neon"] - unsafe fn vmaxvq_s32() { - let r = neon::vmaxvq_s32(i32x4::new(1, 2, -32, 4)); + unsafe fn 
test_vmaxvq_s32() { + let r = vmaxvq_s32(i32x4::new(1, 2, -32, 4).into_bits()); assert_eq!(r, 4_i32); } #[simd_test = "neon"] - unsafe fn vmaxv_u8() { - let r = neon::vmaxv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)); + unsafe fn test_vmaxv_u8() { + let r = vmaxv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5).into_bits()); assert_eq!(r, 8_u8); } #[simd_test = "neon"] - unsafe fn vmaxvq_u8() { + unsafe fn test_vmaxvq_u8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vmaxvq_u8(u8x16::new( + let r = vmaxvq_u8(u8x16::new( 1, 2, 3, 4, 16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 16_u8); } #[simd_test = "neon"] - unsafe fn vmaxv_u16() { - let r = neon::vmaxv_u16(u16x4::new(1, 2, 4, 3)); + unsafe fn test_vmaxv_u16() { + let r = vmaxv_u16(u16x4::new(1, 2, 4, 3).into_bits()); assert_eq!(r, 4_u16); } #[simd_test = "neon"] - unsafe fn vmaxvq_u16() { - let r = neon::vmaxvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)); + unsafe fn test_vmaxvq_u16() { + let r = vmaxvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5).into_bits()); assert_eq!(r, 16_u16); } #[simd_test = "neon"] - unsafe fn vmaxv_u32() { - let r = neon::vmaxv_u32(u32x2::new(1, 4)); + unsafe fn test_vmaxv_u32() { + let r = vmaxv_u32(u32x2::new(1, 4).into_bits()); assert_eq!(r, 4_u32); } #[simd_test = "neon"] - unsafe fn vmaxvq_u32() { - let r = neon::vmaxvq_u32(u32x4::new(1, 2, 32, 4)); + unsafe fn test_vmaxvq_u32() { + let r = vmaxvq_u32(u32x4::new(1, 2, 32, 4).into_bits()); assert_eq!(r, 32_u32); } #[simd_test = "neon"] - unsafe fn vmaxv_f32() { - let r = neon::vmaxv_f32(f32x2::new(1., 4.)); + unsafe fn test_vmaxv_f32() { + let r = vmaxv_f32(f32x2::new(1., 4.).into_bits()); assert_eq!(r, 4_f32); } #[simd_test = "neon"] - unsafe fn vmaxvq_f32() { - let r = neon::vmaxvq_f32(f32x4::new(1., 2., 32., 4.)); + unsafe fn test_vmaxvq_f32() { + let r = vmaxvq_f32(f32x4::new(1., 2., 32., 4.).into_bits()); assert_eq!(r, 32_f32); } #[simd_test = "neon"] - unsafe fn vmaxvq_f64() { - let r = neon::vmaxvq_f64(f64x2::new(1., 4.)); + unsafe fn test_vmaxvq_f64() { + let r = vmaxvq_f64(f64x2::new(1., 4.).into_bits()); assert_eq!(r, 4_f64); } #[simd_test = "neon"] - unsafe fn vminv_s8() { - let r = neon::vminv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5)); + unsafe fn test_vminv_s8() { + let r = vminv_s8(i8x8::new(1, 2, 3, 4, -8, 6, 7, 5).into_bits()); assert_eq!(r, -8_i8); } #[simd_test = "neon"] - unsafe fn vminvq_s8() { + unsafe fn test_vminvq_s8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vminvq_s8(i8x16::new( + let r = vminvq_s8(i8x16::new( 1, 2, 3, 4, -16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, -16_i8); } #[simd_test = "neon"] - unsafe fn vminv_s16() { - let r = neon::vminv_s16(i16x4::new(1, 2, -4, 3)); + unsafe fn test_vminv_s16() { + let r = vminv_s16(i16x4::new(1, 2, -4, 3).into_bits()); assert_eq!(r, -4_i16); } #[simd_test = "neon"] - unsafe fn vminvq_s16() { - let r = neon::vminvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5)); + unsafe fn test_vminvq_s16() { + let r = vminvq_s16(i16x8::new(1, 2, 7, 4, -16, 6, 7, 5).into_bits()); assert_eq!(r, -16_i16); } #[simd_test = "neon"] - unsafe fn vminv_s32() { - let r = neon::vminv_s32(i32x2::new(1, -4)); + unsafe fn test_vminv_s32() { + let r = vminv_s32(i32x2::new(1, -4).into_bits()); assert_eq!(r, -4_i32); } #[simd_test = "neon"] - unsafe fn vminvq_s32() { - let r = neon::vminvq_s32(i32x4::new(1, 2, -32, 4)); + unsafe fn test_vminvq_s32() { + let r = vminvq_s32(i32x4::new(1, 2, -32, 4).into_bits()); assert_eq!(r, -32_i32); } #[simd_test = "neon"] - unsafe fn 
vminv_u8() { - let r = neon::vminv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5)); + unsafe fn test_vminv_u8() { + let r = vminv_u8(u8x8::new(1, 2, 3, 4, 8, 6, 7, 5).into_bits()); assert_eq!(r, 1_u8); } #[simd_test = "neon"] - unsafe fn vminvq_u8() { + unsafe fn test_vminvq_u8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let r = neon::vminvq_u8(u8x16::new( + let r = vminvq_u8(u8x16::new( 1, 2, 3, 4, 16, 6, 7, 5, 8, 1, 1, 1, 1, 1, 1, 1, - )); + ).into_bits()); assert_eq!(r, 1_u8); } #[simd_test = "neon"] - unsafe fn vminv_u16() { - let r = neon::vminv_u16(u16x4::new(1, 2, 4, 3)); + unsafe fn test_vminv_u16() { + let r = vminv_u16(u16x4::new(1, 2, 4, 3).into_bits()); assert_eq!(r, 1_u16); } #[simd_test = "neon"] - unsafe fn vminvq_u16() { - let r = neon::vminvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5)); + unsafe fn test_vminvq_u16() { + let r = vminvq_u16(u16x8::new(1, 2, 7, 4, 16, 6, 7, 5).into_bits()); assert_eq!(r, 1_u16); } #[simd_test = "neon"] - unsafe fn vminv_u32() { - let r = neon::vminv_u32(u32x2::new(1, 4)); + unsafe fn test_vminv_u32() { + let r = vminv_u32(u32x2::new(1, 4).into_bits()); assert_eq!(r, 1_u32); } #[simd_test = "neon"] - unsafe fn vminvq_u32() { - let r = neon::vminvq_u32(u32x4::new(1, 2, 32, 4)); + unsafe fn test_vminvq_u32() { + let r = vminvq_u32(u32x4::new(1, 2, 32, 4).into_bits()); assert_eq!(r, 1_u32); } #[simd_test = "neon"] - unsafe fn vminv_f32() { - let r = neon::vminv_f32(f32x2::new(1., 4.)); + unsafe fn test_vminv_f32() { + let r = vminv_f32(f32x2::new(1., 4.).into_bits()); assert_eq!(r, 1_f32); } #[simd_test = "neon"] - unsafe fn vminvq_f32() { - let r = neon::vminvq_f32(f32x4::new(1., 2., 32., 4.)); + unsafe fn test_vminvq_f32() { + let r = vminvq_f32(f32x4::new(1., 2., 32., 4.).into_bits()); assert_eq!(r, 1_f32); } #[simd_test = "neon"] - unsafe fn vminvq_f64() { - let r = neon::vminvq_f64(f64x2::new(1., 4.)); + unsafe fn test_vminvq_f64() { + let r = vminvq_f64(f64x2::new(1., 4.).into_bits()); assert_eq!(r, 1_f64); } } diff --git a/coresimd/aarch64/v8.rs b/coresimd/aarch64/v8.rs index 5d6b0f23bc..b537df9a3c 100644 --- a/coresimd/aarch64/v8.rs +++ b/coresimd/aarch64/v8.rs @@ -22,17 +22,12 @@ pub unsafe fn _clz_u64(x: u64) -> u64 { x.leading_zeros() as u64 } -#[allow(dead_code)] -extern "C" { - #[link_name = "llvm.bitreverse.i64"] - fn rbit_u64(i: i64) -> i64; -} - /// Reverse the bit order. #[inline] #[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u64(x: u64) -> u64 { - rbit_u64(x as i64) as u64 + use intrinsics::bitreverse; + bitreverse(x) } /// Counts the leading most significant bits set. diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs index f1611e70b0..d778ed6f14 100644 --- a/coresimd/arm/mod.rs +++ b/coresimd/arm/mod.rs @@ -7,14 +7,21 @@ //! http://infocenter.arm.com/help/topic/com.arm.doc. //! ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics +#![allow(non_camel_case_types)] mod v6; pub use self::v6::*; +#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] mod v7; +#[cfg(any(target_arch = "aarch64", target_feature = "v7"))] pub use self::v7::*; -#[cfg(target_feature = "neon")] +// NEON is supported on AArch64, and on ARM when built with the v7 and neon +// features. Building ARM without neon produces incorrect codegen. 
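+// For example, downstream code would mirror this gating (ci/run.sh above
+// passes `-C target-feature=+neon` for the armv7 targets, where the `v7`
+// feature is assumed to be implied by the target):
+//
+//     #[cfg(any(target_arch = "aarch64",
+//               all(target_feature = "v7", target_feature = "neon")))]
+//     use coresimd::arm::neon::*;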
+#[cfg(any(target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon")))] mod neon; -#[cfg(target_feature = "neon")] +#[cfg(any(target_arch = "aarch64", + all(target_feature = "v7", target_feature = "neon")))] pub use self::neon::*; diff --git a/coresimd/arm/neon.rs b/coresimd/arm/neon.rs index e19fa7d4a3..43ce2bec13 100644 --- a/coresimd/arm/neon.rs +++ b/coresimd/arm/neon.rs @@ -2,610 +2,979 @@ #[cfg(test)] use stdsimd_test::assert_instr; - -use coresimd::simd_llvm::simd_add; +use coresimd::simd_llvm::*; use coresimd::simd::*; -use convert::{From, Into}; + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. + // pub struct float16x4_t(f16, f16, f16, f16); + /// ARM-specific 64-bit wide polynomial vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. + pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + ); + /// ARM-specific 128-bit wide polynomial vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`. + // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16, f16); + /// ARM-specific 128-bit wide polynomial vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`.
+ pub struct uint64x2_t(u64, u64); +} + +impl_from_bits_!( + int8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + uint32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + int64x1_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + float32x2_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + poly8x8_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); +impl_from_bits_!( + poly16x4_t: u32x2, + i32x2, + f32x2, + u16x4, + i16x4, + u8x8, + i8x8, + b8x8 +); + +impl_from_bits_!( + int8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + poly8x16_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + poly16x8_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + float32x4_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + int64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); +impl_from_bits_!( + uint64x2_t: u64x2, + i64x2, + f64x2, + u32x4, + i32x4, + f32x4, + u16x8, + i16x8, + u8x16, + i8x16, + b8x16 +); + +#[allow(improper_ctypes)] +extern "C" { + #[cfg(target_arch = "aarch64")] + #[link_name = "llvm.aarch64.neon.frsqrte.v2f32"] + fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t; +} /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { simd_add(a, b) } /// Vector add. 
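// Sketch of the `impl_from_bits_!` conversions above (illustration only):
// any 64-bit portable vector reinterprets into a 64-bit NEON type and back,
// which is how the tests below reach these intrinsics:
//
//     let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
//     let v: int8x8_t = a.into_bits();         // same 64 bits, NEON type
//     let r: i8x8 = vadd_s8(v, v).into_bits(); // i8x8::new(2, 4, ..., 16)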
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_add(a, b) } /// Vector add. 
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(add))] -pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(add))] +pub unsafe fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))] +pub unsafe fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { simd_add(a, b) } /// Vector add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(fadd))] -pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vadd))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fadd))] +pub unsafe fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { simd_add(a, b) } /// Vector long add. 
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { - let a = i16x8::from(a); - let b = i16x8::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { - let a = i32x4::from(a); - let b = i32x4::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(saddl))] -pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { - let a = i64x2::from(a); - let b = i64x2::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(saddl))] +pub unsafe fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { - let a = u16x8::from(a); - let b = u16x8::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); simd_add(a, b) } /// Vector long add. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { - let a = u32x4::from(a); - let b = u32x4::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); simd_add(a, b) } /// Vector long add. 
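// The long adds widen each lane before adding, so the sum cannot wrap in the
// narrow lane type. For example, from `test_vaddl_u8` below:
//
//     let v = ::std::u8::MAX; // 255
//     let a = u8x8::new(v, v, v, v, v, v, v, v);
//     let r: u16x8 = vaddl_u8(a.into_bits(), a.into_bits()).into_bits();
//     // every lane is 510_u16 rather than a wrapped 254_u8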
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(uaddl))] -pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { - let a = u64x2::from(a); - let b = u64x2::from(b); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vaddl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uaddl))] +pub unsafe fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); simd_add(a, b) } -#[allow(improper_ctypes)] -extern "C" { - // The Reference says this instruction is - // supported in v7/A32/A64: - #[link_name = "llvm.aarch64.neon.frsqrte.v2f32"] - fn frsqrte_v2f32(a: f32x2) -> f32x2; -} - -/// Reciprocal square-root estimate. -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(frsqrte))] -pub unsafe fn vrsqrte_f32(a: f32x2) -> f32x2 { - frsqrte_v2f32(a) -} - /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s16(a: i16x8) -> i8x8 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s16(a: int16x8_t) -> int8x8_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s32(a: i32x4) -> i16x4 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s32(a: int32x4_t) -> int16x4_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_s64(a: i64x2) -> i32x2 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_s64(a: int64x2_t) -> int32x2_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u16(a: u16x8) -> u8x8 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u16(a: uint16x8_t) -> uint8x8_t { + simd_cast(a) } /// Vector narrow integer. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u32(a: u32x4) -> u16x4 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u32(a: uint32x4_t) -> uint16x4_t { + simd_cast(a) } /// Vector narrow integer. 
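// `vmovn_*` narrows via `simd_cast`, i.e. each lane is truncated to the
// narrower lane type. A sketch with an assumed out-of-range lane value:
//
//     let a = u16x8::new(0x0104, 1, 2, 3, 4, 5, 6, 7);
//     let r: u8x8 = vmovn_u16(a.into_bits()).into_bits();
//     assert_eq!(r.extract(0), 0x04); // the high byte is discarded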
#[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(xtn))] -pub unsafe fn vmovn_u64(a: u64x2) -> u32x2 { - a.into() +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(xtn))] +pub unsafe fn vmovn_u64(a: uint64x2_t) -> uint32x2_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s8(a: i8x8) -> i16x8 { - a.into() +pub unsafe fn vmovl_s8(a: int8x8_t) -> int16x8_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s16(a: i16x4) -> i32x4 { - a.into() +pub unsafe fn vmovl_s16(a: int16x4_t) -> int32x4_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sxtl))] -pub unsafe fn vmovl_s32(a: i32x2) -> i64x2 { - a.into() +pub unsafe fn vmovl_s32(a: int32x2_t) -> int64x2_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u8(a: u8x8) -> u16x8 { - a.into() +pub unsafe fn vmovl_u8(a: uint8x8_t) -> uint16x8_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u16(a: u16x4) -> u32x4 { - a.into() +pub unsafe fn vmovl_u16(a: uint16x4_t) -> uint32x4_t { + simd_cast(a) } /// Vector long move. #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(sshll))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmovl))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uxtl))] -pub unsafe fn vmovl_u32(a: u32x2) -> u64x2 { - a.into() +pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t { + simd_cast(a) +} + +/// Reciprocal square-root estimate. 
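// `vrsqrte_f32` returns a table-driven per-lane estimate of 1/sqrt(x), not an
// exact result; `test_vrsqrt_f32` below pins the estimated values:
//
//     let r: f32x2 = vrsqrte_f32(f32x2::new(1.0, 2.0).into_bits()).into_bits();
//     // r == f32x2::new(0.9980469, 0.7050781), close to (1.0, 0.70710677)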
+#[cfg(target_arch = "aarch64")] +// FIXME (https://github.com/rust-lang-nursery/stdsimd/issues/383) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(frsqrte))] +pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t { + frsqrte_v2f32(a) } #[cfg(test)] mod tests { use stdsimd_test::simd_test; use simd::*; - use coresimd::arm::neon; + use coresimd::arm::*; + use std::mem; #[simd_test = "neon"] - unsafe fn vadd_s8() { + unsafe fn test_vadd_s8() { let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = i8x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = i8x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vadd_s8(a, b); + let r: i8x8 = vadd_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s8() { + unsafe fn test_vaddq_s8() { let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let b = i8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); let e = i8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_s8(a, b); + let r: i8x16 = vaddq_s8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_s16() { + unsafe fn test_vadd_s16() { let a = i16x4::new(1, 2, 3, 4); let b = i16x4::new(8, 7, 6, 5); let e = i16x4::new(9, 9, 9, 9); - let r = neon::vadd_s16(a, b); + let r: i16x4 = vadd_s16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s16() { + unsafe fn test_vaddq_s16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = i16x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = i16x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_s16(a, b); + let r: i16x8 = vaddq_s16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_s32() { + unsafe fn test_vadd_s32() { let a = i32x2::new(1, 2); let b = i32x2::new(8, 7); let e = i32x2::new(9, 9); - let r = neon::vadd_s32(a, b); + let r: i32x2 = vadd_s32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_s32() { + unsafe fn test_vaddq_s32() { let a = i32x4::new(1, 2, 3, 4); let b = i32x4::new(8, 7, 6, 5); let e = i32x4::new(9, 9, 9, 9); - let r = neon::vaddq_s32(a, b); + let r: i32x4 = vaddq_s32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u8() { + unsafe fn test_vadd_u8() { let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = u8x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = u8x8::new(9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vadd_u8(a, b); + let r: u8x8 = vadd_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u8() { + unsafe fn test_vaddq_u8() { let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let b = u8x16::new(8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1); let e = u8x16::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9); - let r = neon::vaddq_u8(a, b); + let r: u8x16 = vaddq_u8(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u16() { + unsafe fn test_vadd_u16() { let a = u16x4::new(1, 2, 3, 4); let b = u16x4::new(8, 7, 6, 5); let e = u16x4::new(9, 9, 9, 9); - let r = neon::vadd_u16(a, b); + let r: u16x4 = vadd_u16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u16() { + unsafe fn test_vaddq_u16() { let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b = u16x8::new(8, 7, 6, 5, 4, 3, 2, 1); let e = u16x8::new(9, 9, 9, 
9, 9, 9, 9, 9); - let r = neon::vaddq_u16(a, b); + let r: u16x8 = vaddq_u16(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_u32() { + unsafe fn test_vadd_u32() { let a = u32x2::new(1, 2); let b = u32x2::new(8, 7); let e = u32x2::new(9, 9); - let r = neon::vadd_u32(a, b); + let r: u32x2 = vadd_u32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_u32() { + unsafe fn test_vaddq_u32() { let a = u32x4::new(1, 2, 3, 4); let b = u32x4::new(8, 7, 6, 5); let e = u32x4::new(9, 9, 9, 9); - let r = neon::vaddq_u32(a, b); + let r: u32x4 = vaddq_u32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vadd_f32() { + unsafe fn test_vadd_f32() { let a = f32x2::new(1., 2.); let b = f32x2::new(8., 7.); let e = f32x2::new(9., 9.); - let r = neon::vadd_f32(a, b); + let r: f32x2 = vadd_f32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddq_f32() { + unsafe fn test_vaddq_f32() { let a = f32x4::new(1., 2., 3., 4.); let b = f32x4::new(8., 7., 6., 5.); let e = f32x4::new(9., 9., 9., 9.); - let r = neon::vaddq_f32(a, b); + let r: f32x4 = vaddq_f32(a.into_bits(), b.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s8() { + unsafe fn test_vaddl_s8() { let v = ::std::i8::MAX; let a = i8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as i16); let e = i16x8::new(v, v, v, v, v, v, v, v); - let r = neon::vaddl_s8(a, a); + let r: i16x8 = vaddl_s8(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s16() { + unsafe fn test_vaddl_s16() { let v = ::std::i16::MAX; let a = i16x4::new(v, v, v, v); let v = 2 * (v as i32); let e = i32x4::new(v, v, v, v); - let r = neon::vaddl_s16(a, a); + let r: i32x4 = vaddl_s16(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_s32() { + unsafe fn test_vaddl_s32() { let v = ::std::i32::MAX; let a = i32x2::new(v, v); let v = 2 * (v as i64); let e = i64x2::new(v, v); - let r = neon::vaddl_s32(a, a); + let r: i64x2 = vaddl_s32(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u8() { + unsafe fn test_vaddl_u8() { let v = ::std::u8::MAX; let a = u8x8::new(v, v, v, v, v, v, v, v); let v = 2 * (v as u16); let e = u16x8::new(v, v, v, v, v, v, v, v); - let r = neon::vaddl_u8(a, a); + let r: u16x8 = vaddl_u8(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u16() { + unsafe fn test_vaddl_u16() { let v = ::std::u16::MAX; let a = u16x4::new(v, v, v, v); let v = 2 * (v as u32); let e = u32x4::new(v, v, v, v); - let r = neon::vaddl_u16(a, a); + let r: u32x4 = vaddl_u16(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vaddl_u32() { + unsafe fn test_vaddl_u32() { let v = ::std::u32::MAX; let a = u32x2::new(v, v); let v = 2 * (v as u64); let e = u64x2::new(v, v); - let r = neon::vaddl_u32(a, a); + let r: u64x2 = vaddl_u32(a.into_bits(), a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vrsqrt_f32() { - let a = f32x2::new(1.0, 2.0); - let e = f32x2::new(0.9980469, 0.7050781); - let r = neon::vrsqrte_f32(a); - assert_eq!(r, e); - } - - #[simd_test = "neon"] - unsafe fn vmovn_s16() { + unsafe fn test_vmovn_s16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 
8); - let r = neon::vmovn_s16(a); + let r: i8x8 = vmovn_s16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_s32() { + unsafe fn test_vmovn_s32() { let a = i32x4::new(1, 2, 3, 4); let e = i16x4::new(1, 2, 3, 4); - let r = neon::vmovn_s32(a); + let r: i16x4 = vmovn_s32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_s64() { + unsafe fn test_vmovn_s64() { let a = i64x2::new(1, 2); let e = i32x2::new(1, 2); - let r = neon::vmovn_s64(a); + let r: i32x2 = vmovn_s64(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u16() { + unsafe fn test_vmovn_u16() { let a = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovn_u16(a); + let r: u8x8 = vmovn_u16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u32() { + unsafe fn test_vmovn_u32() { let a = u32x4::new(1, 2, 3, 4); let e = u16x4::new(1, 2, 3, 4); - let r = neon::vmovn_u32(a); + let r: u16x4 = vmovn_u32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovn_u64() { + unsafe fn test_vmovn_u64() { let a = u64x2::new(1, 2); let e = u32x2::new(1, 2); - let r = neon::vmovn_u64(a); + let r: u32x2 = vmovn_u64(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s8() { + unsafe fn test_vmovl_s8() { let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovl_s8(a); + let r: i16x8 = vmovl_s8(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s16() { + unsafe fn test_vmovl_s16() { let e = i32x4::new(1, 2, 3, 4); let a = i16x4::new(1, 2, 3, 4); - let r = neon::vmovl_s16(a); + let r: i32x4 = vmovl_s16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_s32() { + unsafe fn test_vmovl_s32() { let e = i64x2::new(1, 2); let a = i32x2::new(1, 2); - let r = neon::vmovl_s32(a); + let r: i64x2 = vmovl_s32(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u8() { + unsafe fn test_vmovl_u8() { let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = neon::vmovl_u8(a); + let r: u16x8 = vmovl_u8(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u16() { + unsafe fn test_vmovl_u16() { let e = u32x4::new(1, 2, 3, 4); let a = u16x4::new(1, 2, 3, 4); - let r = neon::vmovl_u16(a); + let r: u32x4 = vmovl_u16(a.into_bits()).into_bits(); assert_eq!(r, e); } #[simd_test = "neon"] - unsafe fn vmovl_u32() { + unsafe fn test_vmovl_u32() { let e = u64x2::new(1, 2); let a = u32x2::new(1, 2); - let r = neon::vmovl_u32(a); + let r: u64x2 = vmovl_u32(a.into_bits()).into_bits(); + assert_eq!(r, e); + } + + #[cfg(target_arch = "aarch64")] + #[simd_test = "neon"] + unsafe fn test_vrsqrt_f32() { + let a = f32x2::new(1.0, 2.0); + let e = f32x2::new(0.9980469, 0.7050781); + let r: f32x2 = vrsqrte_f32(a.into_bits()).into_bits(); assert_eq!(r, e); } } diff --git a/coresimd/arm/v7.rs b/coresimd/arm/v7.rs index 8e7abef756..aefd7c7465 100644 --- a/coresimd/arm/v7.rs +++ b/coresimd/arm/v7.rs @@ -14,38 +14,41 @@ use stdsimd_test::assert_instr; /// Count Leading Zeros. 
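// Semantics sketch for the scalar CLZ/RBIT helpers below (these are plain
// Rust equivalents, so the values are exact):
//
//     assert_eq!(_clz_u32(1), 31);           // 31 leading zero bits
//     assert_eq!(_rbit_u32(1), 0x8000_0000); // bit 0 moves to bit 31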
#[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u8(x: u8) -> u8 { x.leading_zeros() as u8 } /// Count Leading Zeros. #[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u16(x: u16) -> u16 { x.leading_zeros() as u16 } /// Count Leading Zeros. #[inline] -#[cfg_attr(test, assert_instr(clz))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] pub unsafe fn _clz_u32(x: u32) -> u32 { x.leading_zeros() as u32 } /// Reverse the bit order. #[inline] -#[cfg_attr(test, assert_instr(rbit))] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc +#[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u32(x: u32) -> u32 { - rbit_u32(x as i32) as u32 -} - -#[allow(dead_code)] -extern "C" { - #[link_name = "llvm.bitreverse.i32"] - fn rbit_u32(i: i32) -> i32; + use intrinsics::bitreverse; + bitreverse(x) } #[cfg(test)] diff --git a/coresimd/macros.rs b/coresimd/macros.rs new file mode 100644 index 0000000000..5d28abd41f --- /dev/null +++ b/coresimd/macros.rs @@ -0,0 +1,23 @@ +//! Utility macros. + +#[allow(unused)] +macro_rules! types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + pub struct $name($($fields)*); + + #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] + impl ::clone::Clone for $name { + #[inline] // currently needed for correctness + fn clone(&self) -> $name { + *self + } + } + )*) +} diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 088142d17c..5e4361619e 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -1,5 +1,8 @@ //! `coresimd` +#[macro_use] +mod macros; + #[macro_use] mod ppsv; diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs index 8590ad41f2..840a6af92f 100644 --- a/coresimd/ppsv/mod.rs +++ b/coresimd/ppsv/mod.rs @@ -59,18 +59,19 @@ pub trait IntoBits<T>: ::marker::Sized { fn into_bits(self) -> T; } -// FromBits implies IntoBits +// FromBits implies IntoBits. impl<T, U> IntoBits<U> for T where U: FromBits<T>, { #[inline] fn into_bits(self) -> U { + debug_assert!(::mem::size_of::<Self>() == ::mem::size_of::<U>()); U::from_bits(self) } } -// FromBits (and thus IntoBits) is reflexive +// FromBits (and thus IntoBits) is reflexive. impl<T> FromBits<T> for T { #[inline] fn from_bits(t: Self) -> Self { diff --git a/coresimd/ppsv/v128.rs b/coresimd/ppsv/v128.rs index c0beb3ada1..670c819962 100644 --- a/coresimd/ppsv/v128.rs +++ b/coresimd/ppsv/v128.rs @@ -78,6 +78,72 @@ simd_f_ty! { /// A 128-bit vector with 2 `f64` lanes.
} +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::{__m128, __m128d, __m128i}; +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::{__m128, __m128d, __m128i}; + +macro_rules! from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m128, __m128i, __m128d); + } +} + +#[cfg(all(target_arch = "arm", target_feature = "v7"))] +use coresimd::arch::arm::{// FIXME: float16x8_t, + float32x4_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t}; + +#[cfg(target_arch = "aarch64")] +use coresimd::arch::aarch64::{// FIXME: float16x8_t, + float32x4_t, + float64x2_t, + int16x8_t, + int32x4_t, + int64x2_t, + int8x16_t, + poly16x8_t, + poly8x16_t, + uint16x8_t, + uint32x4_t, + uint64x2_t, + uint8x16_t}; + +macro_rules! from_bits_arm { + ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { + #[cfg(any(all(target_arch = "arm", target_feature = "v7"), target_arch = "aarch64"))] + impl_from_bits_!( + $id: + int8x16_t, + uint8x16_t, + int16x8_t, + uint16x8_t, + int32x4_t, + uint32x4_t, + int64x2_t, + uint64x2_t, + // FIXME: float16x8_t, + float32x4_t, + poly8x16_t, + poly16x8_t + ); + #[cfg(target_arch = "aarch64")] + impl_from_bits_!( + $id: float64x2_t + ); + } +} + impl_from_bits!( u64x2: u64, u64x2_from_bits, @@ -92,6 +158,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u64x2, u64, u64x2_from_bits_x86); +from_bits_arm!(u64x2, u64, u64x2_from_bits_arm, u64x2_from_bits_aarch64); + impl_from_bits!( i64x2: i64, i64x2_from_bits, @@ -106,6 +175,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i64x2, i64, i64x2_from_bits_x86); +from_bits_arm!(i64x2, i64, i64x2_from_bits_arm, i64x2_from_bits_aarch64); + impl_from_bits!( f64x2: f64, f64x2_from_bits, @@ -120,6 +192,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(f64x2, f64, f64x2_from_bits_x86); +from_bits_arm!(f64x2, f64, f64x2_from_bits_arm, f64x2_from_bits_aarch64); + impl_from_bits!( u32x4: u32, u32x4_from_bits, @@ -134,6 +209,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u32x4, u32, u32x4_from_bits_x86); +from_bits_arm!(u32x4, u32, u32x4_from_bits_arm, u32x4_from_bits_aarch64); + impl_from_bits!( i32x4: i32, i32x4_from_bits, @@ -148,6 +226,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i32x4, i32, i32x4_from_bits_x86); +from_bits_arm!(i32x4, i32, i32x4_from_bits_arm, i32x4_from_bits_aarch64); + impl_from_bits!( f32x4: f32, f32x4_from_bits, @@ -162,6 +243,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(f32x4, f32, f32x4_from_bits_x86); +from_bits_arm!(f32x4, f32, f32x4_from_bits_arm, f32x4_from_bits_aarch64); + impl_from_bits!( u16x8: u16, u16x8_from_bits, @@ -176,6 +260,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u16x8, u16, u16x8_from_bits_x86); +from_bits_arm!(u16x8, u16, u16x8_from_bits_arm, u16x8_from_bits_aarch64); + impl_from_bits!( i16x8: i16, i16x8_from_bits, @@ -190,6 +277,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(i16x8, i16, i16x8_from_bits_x86); +from_bits_arm!(i16x8, i16, i16x8_from_bits_arm, i16x8_from_bits_aarch64); + impl_from_bits!( u8x16: u8, u8x16_from_bits, @@ -204,6 +294,9 @@ impl_from_bits!( i8x16, b8x16 ); +from_bits_x86!(u8x16, u8, u8x16_from_bits_x86); +from_bits_arm!(u8x16, u8, u8x16_from_bits_arm, u8x16_from_bits_aarch64); + impl_from_bits!( i8x16: i8, i8x16_from_bits, @@ -218,32 +311,8 @@ impl_from_bits!( u8x16, b8x16 ); - -#[cfg(target_arch = "x86")] -use 
coresimd::arch::x86::{__m128, __m128d, __m128i}; -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::{__m128, __m128d, __m128i}; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i64x2: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x4: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x8: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x8: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x16: __m128, __m128i, __m128d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x16: __m128, __m128i, __m128d); +from_bits_x86!(i8x16, i8, i8x16_from_bits_x86); +from_bits_arm!(i8x16, i8, i8x16_from_bits_arm, i8x16_from_bits_aarch64); impl_from!( f64x2: f64, diff --git a/coresimd/ppsv/v256.rs b/coresimd/ppsv/v256.rs index f88a209b96..b2dfc65905 100644 --- a/coresimd/ppsv/v256.rs +++ b/coresimd/ppsv/v256.rs @@ -94,6 +94,18 @@ simd_f_ty! { /// A 256-bit vector with 4 `f64` lanes. } +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::{__m256, __m256d, __m256i}; +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::{__m256, __m256d, __m256i}; + +macro_rules! 
from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m256, __m256i, __m256d); + } +} + impl_from_bits!( i8x32: i8, i8x32_from_bits, @@ -108,6 +120,8 @@ impl_from_bits!( u8x32, b8x32 ); +from_bits_x86!(i8x32, i8, i8x32_from_bits_x86); + impl_from_bits!( u8x32: u8, u8x32_from_bits, @@ -122,6 +136,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u8x32, u8, u8x32_from_bits_x86); + impl_from_bits!( i16x16: i16, i16x16_from_bits, @@ -136,6 +152,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i16x16, i16, i16x16_from_bits_x86); + impl_from_bits!( u16x16: u16, u16x16_from_bits, @@ -150,6 +168,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u16x16, u16, u16x16_from_bits_x86); + impl_from_bits!( i32x8: i32, i32x8_from_bits, @@ -164,6 +184,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i32x8, i32, i32x8_from_bits_x86); + impl_from_bits!( u32x8: u32, u32x8_from_bits, @@ -178,6 +200,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u32x8, u32, u32x8_from_bits_x86); + impl_from_bits!( f32x8: f32, f32x8_from_bits, @@ -192,6 +216,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(f32x8, f32, f32x8_from_bits_x86); + impl_from_bits!( i64x4: i64, i64x4_from_bits, @@ -206,6 +232,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(i64x4, i64, i64x4_from_bits_x86); + impl_from_bits!( u64x4: u64, u64x4_from_bits, @@ -220,6 +248,8 @@ impl_from_bits!( i8x32, b8x32 ); +from_bits_x86!(u64x4, u64, u64x4_from_bits_x86); + impl_from_bits!( f64x4: f64, f64x4_from_bits, @@ -234,32 +264,7 @@ impl_from_bits!( i8x32, b8x32 ); - -#[cfg(target_arch = "x86")] -use coresimd::arch::x86::{__m256, __m256d, __m256i}; -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::{__m256, __m256d, __m256i}; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i64x4: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x8: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x16: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x16: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x32: __m256, __m256i, __m256d); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x32: __m256, __m256i, __m256d); +from_bits_x86!(f64x4, f64, f64x4_from_bits_x86); impl_from!( f64x4: f64, diff --git a/coresimd/ppsv/v64.rs b/coresimd/ppsv/v64.rs index 6da839861d..a82511dd9e 100644 --- a/coresimd/ppsv/v64.rs +++ b/coresimd/ppsv/v64.rs @@ -57,6 +57,73 @@ simd_f_ty! { /// A 64-bit vector with 2 `f32` lanes. } +#[cfg(target_arch = "x86")] +use coresimd::arch::x86::__m64; + +#[cfg(target_arch = "x86_64")] +use coresimd::arch::x86_64::__m64; + +macro_rules! 
from_bits_x86 { + ($id:ident, $elem_ty:ident, $test_mod:ident) => { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + impl_from_bits_!($id: __m64); + } +} + +#[cfg(all(target_arch = "arm", target_feature = "v7"))] +use coresimd::arch::arm::{// FIXME: float16x4_t, + float32x2_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t}; + +#[cfg(target_arch = "aarch64")] +use coresimd::arch::aarch64::{// FIXME: float16x4_t, + float32x2_t, + float64x1_t, + int16x4_t, + int32x2_t, + int64x1_t, + int8x8_t, + poly16x4_t, + poly8x8_t, + uint16x4_t, + uint32x2_t, + uint64x1_t, + uint8x8_t}; + +macro_rules! from_bits_arm { + ($id:ident, $elem_ty:ident, $test_mod_arm:ident, $test_mod_a64:ident) => { + #[cfg(any(all(target_arch = "arm", target_feature = "v7"), target_arch = "aarch64"))] + impl_from_bits_!( + $id: + int64x1_t, + uint64x1_t, + uint32x2_t, + int32x2_t, + float32x2_t, + uint16x4_t, + int16x4_t, + // FIXME: float16x4_t + poly16x4_t, + uint8x8_t, + int8x8_t, + poly8x8_t + ); + #[cfg(target_arch = "aarch64")] + impl_from_bits_!( + $id: float64x1_t + ); + } +} + impl_from_bits!( u32x2: u32, u32x2_from_bits, @@ -68,6 +135,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u32x2, u32, u32x2_from_bits_x86); +from_bits_arm!(u32x2, u32, u32x2_from_bits_arm, u32x2_from_bits_aarch64); + impl_from_bits!( i32x2: i32, i32x2_from_bits, @@ -79,6 +149,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(i32x2, i32, i32x2_from_bits_x86); +from_bits_arm!(i32x2, i32, i32x2_from_bits_arm, i32x2_from_bits_aarch64); + impl_from_bits!( f32x2: f32, f32x2_from_bits, @@ -90,6 +163,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(f32x2, f32, f32x2_from_bits_x86); +from_bits_arm!(f32x2, f32, f32x2_from_bits_arm, f32x2_from_bits_aarch64); + impl_from_bits!( u16x4: u16, u16x4_from_bits, @@ -100,6 +176,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u16x4, u16, u16x4_from_bits_x86); +from_bits_arm!(u16x4, u16, u16x4_from_bits_arm, u16x4_from_bits_aarch64); + impl_from_bits!( i16x4: i16, i16x4_from_bits, @@ -110,6 +189,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(i16x4, i16, i16x4_from_bits_x86); +from_bits_arm!(i16x4, i16, i16x4_from_bits_arm, i16x4_from_bits_aarch64); + impl_from_bits!( u8x8: u8, u8x8_from_bits, @@ -120,6 +202,9 @@ impl_from_bits!( i8x8, b8x8 ); +from_bits_x86!(u8x8, u8, u8x8_from_bits_x86); +from_bits_arm!(u8x8, u8, u8x8_from_bits_arm, u8x8_from_bits_aarch64); + impl_from_bits!( i8x8: i8, i8x8_from_bits, @@ -130,27 +215,8 @@ impl_from_bits!( u8x8, b8x8 ); - -#[cfg(target_arch = "x86")] -use coresimd::arch::x86::__m64; - -#[cfg(target_arch = "x86_64")] -use coresimd::arch::x86_64::__m64; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(f32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i32x2: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u16x4: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i16x4: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(u8x8: __m64); -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -impl_from_bits_!(i8x8: __m64); +from_bits_x86!(i8x8, i8, i8x8_from_bits_x86); +from_bits_arm!(i8x8, i8, i8x8_from_bits_arm, i8x8_from_bits_aarch64); impl_from!( f32x2: f32, diff --git a/coresimd/x86/mod.rs 
b/coresimd/x86/mod.rs index 32915c3329..653e0f8bf7 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -6,27 +6,6 @@ use mem; #[macro_use] mod macros; -macro_rules! types { - ($( - $(#[$doc:meta])* - pub struct $name:ident($($fields:tt)*); - )*) => ($( - $(#[$doc])* - #[derive(Copy, Debug)] - #[allow(non_camel_case_types)] - #[repr(simd)] - pub struct $name($($fields)*); - - #[cfg_attr(feature = "cargo-clippy", allow(expl_impl_clone_on_copy))] - impl Clone for $name { - #[inline] // currently needed for correctness - fn clone(&self) -> $name { - *self - } - } - )*) -} - types! { /// 64-bit wide integer vector type, x86-specific /// @@ -459,12 +438,12 @@ impl m256iExt for __m256i { } } -use coresimd::simd::{b8x16, b8x32, b8x8, f32x4, f32x8, f64x2, f64x4, i16x16, - i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, i8x16, - i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, u32x8, - u64x2, u64x4, u8x16, u8x32, u8x8}; +use coresimd::simd::{b8x16, b8x32, b8x8, f32x2, f32x4, f32x8, f64x2, f64x4, + i16x16, i16x4, i16x8, i32x2, i32x4, i32x8, i64x2, i64x4, + i8x16, i8x32, i8x8, u16x16, u16x4, u16x8, u32x2, u32x4, + u32x8, u64x2, u64x4, u8x16, u8x32, u8x8}; -impl_from_bits_!(__m64: u32x2, i32x2, u16x4, i16x4, u8x8, i8x8, b8x8); +impl_from_bits_!(__m64: u32x2, i32x2, f32x2, u16x4, i16x4, u8x8, i8x8, b8x8); impl_from_bits_!( __m128: u64x2, i64x2, diff --git a/crates/coresimd/tests/v128.rs b/crates/coresimd/tests/v128.rs index 8eb1e1801c..d880e3a23d 100644 --- a/crates/coresimd/tests/v128.rs +++ b/crates/coresimd/tests/v128.rs @@ -1,7 +1,8 @@ //! coresimd 128-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v16.rs b/crates/coresimd/tests/v16.rs index b44c03f281..71d6cdb44f 100644 --- a/crates/coresimd/tests/v16.rs +++ b/crates/coresimd/tests/v16.rs @@ -1,7 +1,8 @@ //! coresimd 16-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v256.rs b/crates/coresimd/tests/v256.rs index e4f7416d2d..5149ba0204 100644 --- a/crates/coresimd/tests/v256.rs +++ b/crates/coresimd/tests/v256.rs @@ -1,7 +1,8 @@ //! coresimd 256-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v32.rs b/crates/coresimd/tests/v32.rs index 83991fc8b2..007546833d 100644 --- a/crates/coresimd/tests/v32.rs +++ b/crates/coresimd/tests/v32.rs @@ -1,7 +1,8 @@ //! 
coresimd 32-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v512.rs b/crates/coresimd/tests/v512.rs index 6420ecb68e..31563f6f0d 100644 --- a/crates/coresimd/tests/v512.rs +++ b/crates/coresimd/tests/v512.rs @@ -1,7 +1,8 @@ //! coresimd 512-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/coresimd/tests/v64.rs b/crates/coresimd/tests/v64.rs index 5434b4c5ab..ce27e809e3 100644 --- a/crates/coresimd/tests/v64.rs +++ b/crates/coresimd/tests/v64.rs @@ -1,7 +1,8 @@ //! coresimd 64-bit wide vector tests #![cfg_attr(feature = "strict", deny(warnings))] -#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float)] +#![feature(stdsimd, link_llvm_intrinsics, simd_ffi, core_float, + cfg_target_feature)] #![allow(unused_imports, dead_code)] #[cfg(test)] @@ -30,7 +31,7 @@ macro_rules! vector_impl { mod ppsv; #[cfg(test)] -use std::marker; +use std::{marker, mem}; #[cfg(all(test, target_arch = "aarch64"))] use std::cmp; diff --git a/crates/simd-test-macro/src/lib.rs b/crates/simd-test-macro/src/lib.rs index 366ad7d29e..c3c570a696 100644 --- a/crates/simd-test-macro/src/lib.rs +++ b/crates/simd-test-macro/src/lib.rs @@ -77,7 +77,7 @@ pub fn simd_test( .expect(&format!("target triple contained no \"-\": {}", target)) { "i686" | "x86_64" | "i586" => "is_x86_feature_detected", - "arm" => "is_arm_feature_detected", + "arm" | "armv7" => "is_arm_feature_detected", "aarch64" => "is_aarch64_feature_detected", "powerpc64" => "is_powerpc64_feature_detected", "mips" | "mipsel" => { diff --git a/examples/nbody.rs b/examples/nbody.rs index 4abe850d8e..b78ce8be67 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -43,21 +43,21 @@ impl Frsqrt for f64x2 { }; Self::new(u.extract(0), u.extract(1)) } - #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), - target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { #[cfg(target_arch = "arm")] use stdsimd::arch::arm::*; #[cfg(target_arch = "aarch64")] use stdsimd::arch::aarch64::*; - unsafe { vrsqrte_f32((*self).into()).into() } + let t: f32x2 = (*self).into(); + let t: f32x2 = unsafe { vrsqrte_f32(t.into_bits()).into_bits() }; + t.into() } #[cfg(not(any(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse"), - all(any(target_arch = "arm", - target_arch = "aarch64"), + all(target_arch = "aarch64", target_feature = "neon"))))] { self.replace(0, 1. / self.extract(0).sqrt()); diff --git a/stdsimd/arch/detect/arch/arm.rs b/stdsimd/arch/detect/arch/arm.rs index 852e9b8d2c..08b6a7e87e 100644 --- a/stdsimd/arch/detect/arch/arm.rs +++ b/stdsimd/arch/detect/arch/arm.rs @@ -11,6 +11,10 @@ macro_rules! 
is_arm_feature_detected { cfg!(target_feature = "pmull") || $crate::arch::detect::check_for($crate::arch::detect::Feature::pmull) }; + ("v7") => { compile_error!("\"v7\" feature cannot be detected at run-time") }; + ("vfp2") => { compile_error!("\"vfp2\" feature cannot be detected at run-time") }; + ("vfp3") => { compile_error!("\"vfp3\" feature cannot be detected at run-time") }; + ("vfp4") => { compile_error!("\"vfp4\" feature cannot be detected at run-time") }; ($t:tt) => { compile_error!(concat!("unknown arm target feature: ", $t)) }; }
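Usage sketch (illustration only, not part of the patch): with these changes the portable `ppsv` vectors and the ARM-specific NEON types interconvert through the `FromBits`/`IntoBits` traits, which is the pattern the updated tests and `examples/nbody.rs` above rely on. The snippet below is a minimal sketch assuming a nightly toolchain, an aarch64 target built with `-C target-feature=+neon`, and that `FromBits`/`IntoBits` are re-exported from `stdsimd::simd`; the function name `rsqrte_example` is made up for the example.

#![feature(stdsimd)]
extern crate stdsimd;

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
fn rsqrte_example() {
    use stdsimd::simd::{f32x2, FromBits, IntoBits};
    use stdsimd::arch::aarch64::{float32x2_t, vrsqrte_f32};

    let a = f32x2::new(1.0, 2.0);
    // Portable vector -> NEON vector; `into_bits` is a size-checked
    // bit-cast (see the new debug_assert in coresimd/ppsv/mod.rs).
    let n: float32x2_t = a.into_bits();
    // Call the NEON intrinsic, then convert back to the portable type
    // so the result can be compared lane-wise.
    let r = f32x2::from_bits(unsafe { vrsqrte_f32(n) });
    assert_eq!(r, f32x2::new(0.9980469, 0.7050781));
}

The expected lanes match the `test_vrsqrt_f32` test above; `vrsqrte_f32` computes only an estimate of `1/sqrt(x)`, so the exact values are specific to this input.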