From a0db30bb6e03e64c981678473455747abf7c7551 Mon Sep 17 00:00:00 2001 From: Ioana Ghiban Date: Tue, 16 Apr 2024 18:41:19 +0200 Subject: [PATCH] Add NEON intrinsics using float --- kleidicv/include/kleidicv/neon.h | 10 + kleidicv/include/kleidicv/neon_intrinsics.h | 236 ++++++++++---------- 2 files changed, 134 insertions(+), 112 deletions(-) diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index 68c9d9eb9..7339eb8ee 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -134,6 +134,16 @@ class VectorTypes { using Vector4Type = uint64x2x4_t; }; // end of class VectorTypes +template <> +class VectorTypes { + public: + using ScalarType = float32_t; + using VectorType = float32x4_t; + using Vector2Type = float32x4x2_t; + using Vector3Type = float32x4x3_t; + using Vector4Type = float32x4x4_t; +}; // end of class VectorTypes + // Base class for all NEON vector traits. template class VecTraitsBase : public VectorTypes { diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h index b90ee2e4d..22ef062f1 100644 --- a/kleidicv/include/kleidicv/neon_intrinsics.h +++ b/kleidicv/include/kleidicv/neon_intrinsics.h @@ -73,8 +73,8 @@ NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip2q); // vabd* // ----------------------------------------------------------------------------- -static inline int8x16_t vabdq(int8x16_t lhs, int8x16_t rhs) { return vabdq_s8(lhs, rhs); } -static inline uint8x16_t vabdq(uint8x16_t lhs, uint8x16_t rhs) { return vabdq_u8(lhs, rhs); } +static inline int8x16_t vabdq(int8x16_t lhs, int8x16_t rhs) { return vabdq_s8(lhs, rhs); } +static inline uint8x16_t vabdq(uint8x16_t lhs, uint8x16_t rhs) { return vabdq_u8(lhs, rhs); } static inline int16x8_t vabdq(int16x8_t lhs, int16x8_t rhs) { return vabdq_s16(lhs, rhs); } static inline uint16x8_t vabdq(uint16x8_t lhs, uint16x8_t rhs) { return vabdq_u16(lhs, rhs); } static inline int32x4_t vabdq(int32x4_t lhs, int32x4_t rhs) { return vabdq_s32(lhs, rhs); } @@ -84,17 +84,17 @@ static inline uint32x4_t vabdq(uint32x4_t lhs, uint32x4_t rhs) { return vabdq_u3 // vqabs* // ----------------------------------------------------------------------------- -static inline int8x16_t vqabsq(int8x16_t vec) { return vqabsq_s8(vec); } +static inline int8x16_t vqabsq(int8x16_t vec) { return vqabsq_s8(vec); } static inline int16x8_t vqabsq(int16x8_t vec) { return vqabsq_s16(vec); } static inline int32x4_t vqabsq(int32x4_t vec) { return vqabsq_s32(vec); } static inline int64x2_t vqabsq(int64x2_t vec) { return vqabsq_s64(vec); } // ----------------------------------------------------------------------------- -// vabd* +// vaddl* // ----------------------------------------------------------------------------- -static inline int16x8_t vaddl(int8x8_t lhs, int8x8_t rhs) { return vaddl_s8(lhs, rhs); } -static inline uint16x8_t vaddl(uint8x8_t lhs, uint8x8_t rhs) { return vaddl_u8(lhs, rhs); } +static inline int16x8_t vaddl(int8x8_t lhs, int8x8_t rhs) { return vaddl_s8(lhs, rhs); } +static inline uint16x8_t vaddl(uint8x8_t lhs, uint8x8_t rhs) { return vaddl_u8(lhs, rhs); } static inline int32x4_t vaddl(int16x4_t lhs, int16x4_t rhs) { return vaddl_s16(lhs, rhs); } static inline uint32x4_t vaddl(uint16x4_t lhs, uint16x4_t rhs) { return vaddl_u16(lhs, rhs); } static inline int64x2_t vaddl(int32x2_t lhs, int32x2_t rhs) { return vaddl_s32(lhs, rhs); } @@ -104,8 +104,8 @@ static inline uint64x2_t vaddl(uint32x2_t lhs, uint32x2_t rhs) { return vaddl_u3 // vget_high* // ----------------------------------------------------------------------------- -static inline int8x8_t vget_high(int8x16_t vec) { return vget_high_s8(vec); } -static inline uint8x8_t vget_high(uint8x16_t vec) { return vget_high_u8(vec); } +static inline int8x8_t vget_high(int8x16_t vec) { return vget_high_s8(vec); } +static inline uint8x8_t vget_high(uint8x16_t vec) { return vget_high_u8(vec); } static inline int16x4_t vget_high(int16x8_t vec) { return vget_high_s16(vec); } static inline uint16x4_t vget_high(uint16x8_t vec) { return vget_high_u16(vec); } static inline int32x2_t vget_high(int32x4_t vec) { return vget_high_s32(vec); } @@ -117,8 +117,8 @@ static inline uint64x1_t vget_high(uint64x2_t vec) { return vget_high_u64(vec); // vget_low* // ----------------------------------------------------------------------------- -static inline int8x8_t vget_low(int8x16_t vec) { return vget_low_s8(vec); } -static inline uint8x8_t vget_low(uint8x16_t vec) { return vget_low_u8(vec); } +static inline int8x8_t vget_low(int8x16_t vec) { return vget_low_s8(vec); } +static inline uint8x8_t vget_low(uint8x16_t vec) { return vget_low_u8(vec); } static inline int16x4_t vget_low(int16x8_t vec) { return vget_low_s16(vec); } static inline uint16x4_t vget_low(uint16x8_t vec) { return vget_low_u16(vec); } static inline int32x2_t vget_low(int32x4_t vec) { return vget_low_s32(vec); } @@ -130,45 +130,49 @@ static inline uint64x1_t vget_low(uint64x2_t vec) { return vget_low_u64(vec); } // vminq* // ----------------------------------------------------------------------------- -static inline int8x16_t vminq(int8x16_t lhs, int8x16_t rhs) { return vminq_s8(lhs, rhs); } -static inline uint8x16_t vminq(uint8x16_t lhs, uint8x16_t rhs) { return vminq_u8(lhs, rhs); } -static inline int16x8_t vminq(int16x8_t lhs, int16x8_t rhs) { return vminq_s16(lhs, rhs); } -static inline uint16x8_t vminq(uint16x8_t lhs, uint16x8_t rhs) { return vminq_u16(lhs, rhs); } -static inline int32x4_t vminq(int32x4_t lhs, int32x4_t rhs) { return vminq_s32(lhs, rhs); } -static inline uint32x4_t vminq(uint32x4_t lhs, uint32x4_t rhs) { return vminq_u32(lhs, rhs); } +static inline int8x16_t vminq(int8x16_t lhs, int8x16_t rhs) { return vminq_s8(lhs, rhs); } +static inline uint8x16_t vminq(uint8x16_t lhs, uint8x16_t rhs) { return vminq_u8(lhs, rhs); } +static inline int16x8_t vminq(int16x8_t lhs, int16x8_t rhs) { return vminq_s16(lhs, rhs); } +static inline uint16x8_t vminq(uint16x8_t lhs, uint16x8_t rhs) { return vminq_u16(lhs, rhs); } +static inline int32x4_t vminq(int32x4_t lhs, int32x4_t rhs) { return vminq_s32(lhs, rhs); } +static inline uint32x4_t vminq(uint32x4_t lhs, uint32x4_t rhs) { return vminq_u32(lhs, rhs); } +static inline float32x4_t vminq(float32x4_t lhs, float32x4_t rhs) { return vminq_f32(lhs, rhs); } // ----------------------------------------------------------------------------- // vmaxq* // ----------------------------------------------------------------------------- -static inline int8x16_t vmaxq(int8x16_t lhs, int8x16_t rhs) { return vmaxq_s8(lhs, rhs); } -static inline uint8x16_t vmaxq(uint8x16_t lhs, uint8x16_t rhs) { return vmaxq_u8(lhs, rhs); } -static inline int16x8_t vmaxq(int16x8_t lhs, int16x8_t rhs) { return vmaxq_s16(lhs, rhs); } -static inline uint16x8_t vmaxq(uint16x8_t lhs, uint16x8_t rhs) { return vmaxq_u16(lhs, rhs); } -static inline int32x4_t vmaxq(int32x4_t lhs, int32x4_t rhs) { return vmaxq_s32(lhs, rhs); } -static inline uint32x4_t vmaxq(uint32x4_t lhs, uint32x4_t rhs) { return vmaxq_u32(lhs, rhs); } +static inline int8x16_t vmaxq(int8x16_t lhs, int8x16_t rhs) { return vmaxq_s8(lhs, rhs); } +static inline uint8x16_t vmaxq(uint8x16_t lhs, uint8x16_t rhs) { return vmaxq_u8(lhs, rhs); } +static inline int16x8_t vmaxq(int16x8_t lhs, int16x8_t rhs) { return vmaxq_s16(lhs, rhs); } +static inline uint16x8_t vmaxq(uint16x8_t lhs, uint16x8_t rhs) { return vmaxq_u16(lhs, rhs); } +static inline int32x4_t vmaxq(int32x4_t lhs, int32x4_t rhs) { return vmaxq_s32(lhs, rhs); } +static inline uint32x4_t vmaxq(uint32x4_t lhs, uint32x4_t rhs) { return vmaxq_u32(lhs, rhs); } +static inline float32x4_t vmaxq(float32x4_t lhs, float32x4_t rhs) { return vmaxq_f32(lhs, rhs); } // ----------------------------------------------------------------------------- // vminvq* // ----------------------------------------------------------------------------- -static inline int8_t vminvq(int8x16_t src) { return vminvq_s8(src); } -static inline uint8_t vminvq(uint8x16_t src) { return vminvq_u8(src); } -static inline int16_t vminvq(int16x8_t src) { return vminvq_s16(src); } -static inline uint16_t vminvq(uint16x8_t src) { return vminvq_u16(src); } -static inline int32_t vminvq(int32x4_t src) { return vminvq_s32(src); } -static inline uint32_t vminvq(uint32x4_t src) { return vminvq_u32(src); } +static inline int8_t vminvq(int8x16_t src) { return vminvq_s8(src); } +static inline uint8_t vminvq(uint8x16_t src) { return vminvq_u8(src); } +static inline int16_t vminvq(int16x8_t src) { return vminvq_s16(src); } +static inline uint16_t vminvq(uint16x8_t src) { return vminvq_u16(src); } +static inline int32_t vminvq(int32x4_t src) { return vminvq_s32(src); } +static inline uint32_t vminvq(uint32x4_t src) { return vminvq_u32(src); } +static inline float32_t vminvq(float32x4_t src) { return vminvq_f32(src); } // ----------------------------------------------------------------------------- // vmaxvq* // ----------------------------------------------------------------------------- -static inline int8_t vmaxvq(int8x16_t src) { return vmaxvq_s8(src); } -static inline uint8_t vmaxvq(uint8x16_t src) { return vmaxvq_u8(src); } -static inline int16_t vmaxvq(int16x8_t src) { return vmaxvq_s16(src); } -static inline uint16_t vmaxvq(uint16x8_t src) { return vmaxvq_u16(src); } -static inline int32_t vmaxvq(int32x4_t src) { return vmaxvq_s32(src); } -static inline uint32_t vmaxvq(uint32x4_t src) { return vmaxvq_u32(src); } +static inline int8_t vmaxvq(int8x16_t src) { return vmaxvq_s8(src); } +static inline uint8_t vmaxvq(uint8x16_t src) { return vmaxvq_u8(src); } +static inline int16_t vmaxvq(int16x8_t src) { return vmaxvq_s16(src); } +static inline uint16_t vmaxvq(uint16x8_t src) { return vmaxvq_u16(src); } +static inline int32_t vmaxvq(int32x4_t src) { return vmaxvq_s32(src); } +static inline uint32_t vmaxvq(uint32x4_t src) { return vmaxvq_u32(src); } +static inline float32_t vmaxvq(float32x4_t src) { return vmaxvq_f32(src); } // ----------------------------------------------------------------------------- // vrshrn_n* @@ -185,8 +189,8 @@ template static inline uint32x2_t vrshrn_n(uint64x2_t vec) { return vrsh // vshll_n* // ----------------------------------------------------------------------------- -template static inline int16x8_t vshll_n(int8x8_t vec) { return vshll_n_s8(vec, n); } -template static inline uint16x8_t vshll_n(uint8x8_t vec) { return vshll_n_u8(vec, n); } +template static inline int16x8_t vshll_n(int8x8_t vec) { return vshll_n_s8(vec, n); } +template static inline uint16x8_t vshll_n(uint8x8_t vec) { return vshll_n_u8(vec, n); } template static inline int32x4_t vshll_n(int16x4_t vec) { return vshll_n_s16(vec, n); } template static inline uint32x4_t vshll_n(uint16x4_t vec) { return vshll_n_u16(vec, n); } template static inline int64x2_t vshll_n(int32x2_t vec) { return vshll_n_s32(vec, n); } @@ -196,8 +200,8 @@ template static inline uint64x2_t vshll_n(uint32x2_t vec) { return vshll // vshlq_n* // ----------------------------------------------------------------------------- -template static inline int8x16_t vshlq_n(int8x16_t vec) { return vshlq_n_s8(vec, n); } -template static inline uint8x16_t vshlq_n(uint8x16_t vec) { return vshlq_n_u8(vec, n); } +template static inline int8x16_t vshlq_n(int8x16_t vec) { return vshlq_n_s8(vec, n); } +template static inline uint8x16_t vshlq_n(uint8x16_t vec) { return vshlq_n_u8(vec, n); } template static inline int16x8_t vshlq_n(int16x8_t vec) { return vshlq_n_s16(vec, n); } template static inline uint16x8_t vshlq_n(uint16x8_t vec) { return vshlq_n_u16(vec, n); } template static inline int32x4_t vshlq_n(int32x4_t vec) { return vshlq_n_s32(vec, n); } @@ -209,21 +213,22 @@ template static inline uint64x2_t vshlq_n(uint64x2_t vec) { return vshlq // vdupq* // ----------------------------------------------------------------------------- -static inline int8x16_t vdupq_n(int8_t src) { return vdupq_n_s8(src); } -static inline uint8x16_t vdupq_n(uint8_t src) { return vdupq_n_u8(src); } -static inline int16x8_t vdupq_n(int16_t src) { return vdupq_n_s16(src); } -static inline uint16x8_t vdupq_n(uint16_t src) { return vdupq_n_u16(src); } -static inline int32x4_t vdupq_n(int32_t src) { return vdupq_n_s32(src); } -static inline uint32x4_t vdupq_n(uint32_t src) { return vdupq_n_u32(src); } -static inline int64x2_t vdupq_n(int64_t src) { return vdupq_n_s64(src); } -static inline uint64x2_t vdupq_n(uint64_t src) { return vdupq_n_u64(src); } +static inline int8x16_t vdupq_n(int8_t src) { return vdupq_n_s8(src); } +static inline uint8x16_t vdupq_n(uint8_t src) { return vdupq_n_u8(src); } +static inline int16x8_t vdupq_n(int16_t src) { return vdupq_n_s16(src); } +static inline uint16x8_t vdupq_n(uint16_t src) { return vdupq_n_u16(src); } +static inline int32x4_t vdupq_n(int32_t src) { return vdupq_n_s32(src); } +static inline uint32x4_t vdupq_n(uint32_t src) { return vdupq_n_u32(src); } +static inline int64x2_t vdupq_n(int64_t src) { return vdupq_n_s64(src); } +static inline uint64x2_t vdupq_n(uint64_t src) { return vdupq_n_u64(src); } +static inline float32x4_t vdupq_n(float32_t src) { return vdupq_n_f32(src); } // ----------------------------------------------------------------------------- // vmull* // ----------------------------------------------------------------------------- -static inline int16x8_t vmull(int8x8_t lhs, int8x8_t rhs) { return vmull_s8(lhs, rhs); } -static inline uint16x8_t vmull(uint8x8_t lhs, uint8x8_t rhs) { return vmull_u8(lhs, rhs); } +static inline int16x8_t vmull(int8x8_t lhs, int8x8_t rhs) { return vmull_s8(lhs, rhs); } +static inline uint16x8_t vmull(uint8x8_t lhs, uint8x8_t rhs) { return vmull_u8(lhs, rhs); } static inline int32x4_t vmull(int16x4_t lhs, int16x4_t rhs) { return vmull_s16(lhs, rhs); } static inline uint32x4_t vmull(uint16x4_t lhs, uint16x4_t rhs) { return vmull_u16(lhs, rhs); } static inline int64x2_t vmull(int32x2_t lhs, int32x2_t rhs) { return vmull_s32(lhs, rhs); } @@ -232,8 +237,8 @@ static inline int64x2_t vmull(int32x2_t lhs, int32x2_t rhs) { return vmull_s3 // vmull_high* // ----------------------------------------------------------------------------- -static inline int16x8_t vmull_high(int8x16_t lhs, int8x16_t rhs) { return vmull_high_s8(lhs, rhs); } -static inline uint16x8_t vmull_high(uint8x16_t lhs, uint8x16_t rhs) { return vmull_high_u8(lhs, rhs); } +static inline int16x8_t vmull_high(int8x16_t lhs, int8x16_t rhs) { return vmull_high_s8(lhs, rhs); } +static inline uint16x8_t vmull_high(uint8x16_t lhs, uint8x16_t rhs) { return vmull_high_u8(lhs, rhs); } static inline int32x4_t vmull_high(int16x8_t lhs, int16x8_t rhs) { return vmull_high_s16(lhs, rhs); } static inline uint32x4_t vmull_high(uint16x8_t lhs, uint16x8_t rhs) { return vmull_high_u16(lhs, rhs); } static inline int64x2_t vmull_high(int32x4_t lhs, int32x4_t rhs) { return vmull_high_s32(lhs, rhs); } @@ -262,68 +267,75 @@ static inline int32x4_t vqmovn_high(int32x2_t low, int64x2_t src) { return vq // NEON load operations // ----------------------------------------------------------------------------- -static inline int8x16_t vld1q(const int8_t *src) { return vld1q_s8(src); } -static inline uint8x16_t vld1q(const uint8_t *src) { return vld1q_u8(src); } -static inline int16x8_t vld1q(const int16_t *src) { return vld1q_s16(src); } -static inline uint16x8_t vld1q(const uint16_t *src) { return vld1q_u16(src); } -static inline int32x4_t vld1q(const int32_t *src) { return vld1q_s32(src); } -static inline uint32x4_t vld1q(const uint32_t *src) { return vld1q_u32(src); } -static inline int64x2_t vld1q(const int64_t *src) { return vld1q_s64(src); } -static inline uint64x2_t vld1q(const uint64_t *src) { return vld1q_u64(src); } - -static inline int8x16x2_t vld2q(const int8_t *src) { return vld2q_s8(src); } -static inline uint8x16x2_t vld2q(const uint8_t *src) { return vld2q_u8(src); } -static inline int16x8x2_t vld2q(const int16_t *src) { return vld2q_s16(src); } -static inline uint16x8x2_t vld2q(const uint16_t *src) { return vld2q_u16(src); } -static inline int32x4x2_t vld2q(const int32_t *src) { return vld2q_s32(src); } -static inline uint32x4x2_t vld2q(const uint32_t *src) { return vld2q_u32(src); } -static inline int64x2x2_t vld2q(const int64_t *src) { return vld2q_s64(src); } -static inline uint64x2x2_t vld2q(const uint64_t *src) { return vld2q_u64(src); } - -static inline int8x16x3_t vld3q(const int8_t *src) { return vld3q_s8(src); } -static inline uint8x16x3_t vld3q(const uint8_t *src) { return vld3q_u8(src); } -static inline int16x8x3_t vld3q(const int16_t *src) { return vld3q_s16(src); } -static inline uint16x8x3_t vld3q(const uint16_t *src) { return vld3q_u16(src); } -static inline int32x4x3_t vld3q(const int32_t *src) { return vld3q_s32(src); } -static inline uint32x4x3_t vld3q(const uint32_t *src) { return vld3q_u32(src); } -static inline int64x2x3_t vld3q(const int64_t *src) { return vld3q_s64(src); } -static inline uint64x2x3_t vld3q(const uint64_t *src) { return vld3q_u64(src); } - -static inline int8x16x4_t vld4q(const int8_t *src) { return vld4q_s8(src); } -static inline uint8x16x4_t vld4q(const uint8_t *src) { return vld4q_u8(src); } -static inline int16x8x4_t vld4q(const int16_t *src) { return vld4q_s16(src); } -static inline uint16x8x4_t vld4q(const uint16_t *src) { return vld4q_u16(src); } -static inline int32x4x4_t vld4q(const int32_t *src) { return vld4q_s32(src); } -static inline uint32x4x4_t vld4q(const uint32_t *src) { return vld4q_u32(src); } -static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); } -static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); } - -static inline int8x16x2_t vld1q_x2(const int8_t *src) { return vld1q_s8_x2(src); } -static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { return vld1q_u8_x2(src); } -static inline int16x8x2_t vld1q_x2(const int16_t *src) { return vld1q_s16_x2(src); } -static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { return vld1q_u16_x2(src); } -static inline int32x4x2_t vld1q_x2(const int32_t *src) { return vld1q_s32_x2(src); } -static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { return vld1q_u32_x2(src); } -static inline int64x2x2_t vld1q_x2(const int64_t *src) { return vld1q_s64_x2(src); } -static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { return vld1q_u64_x2(src); } - -static inline int8x16x3_t vld1q_x3(const int8_t *src) { return vld1q_s8_x3(src); } -static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { return vld1q_u8_x3(src); } -static inline int16x8x3_t vld1q_x3(const int16_t *src) { return vld1q_s16_x3(src); } -static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { return vld1q_u16_x3(src); } -static inline int32x4x3_t vld1q_x3(const int32_t *src) { return vld1q_s32_x3(src); } -static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { return vld1q_u32_x3(src); } -static inline int64x2x3_t vld1q_x3(const int64_t *src) { return vld1q_s64_x3(src); } -static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { return vld1q_u64_x3(src); } - -static inline int8x16x4_t vld1q_x4(const int8_t *src) { return vld1q_s8_x4(src); } -static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { return vld1q_u8_x4(src); } -static inline int16x8x4_t vld1q_x4(const int16_t *src) { return vld1q_s16_x4(src); } -static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { return vld1q_u16_x4(src); } -static inline int32x4x4_t vld1q_x4(const int32_t *src) { return vld1q_s32_x4(src); } -static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { return vld1q_u32_x4(src); } -static inline int64x2x4_t vld1q_x4(const int64_t *src) { return vld1q_s64_x4(src); } -static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { return vld1q_u64_x4(src); } +static inline int8x16_t vld1q(const int8_t *src) { return vld1q_s8(src); } +static inline uint8x16_t vld1q(const uint8_t *src) { return vld1q_u8(src); } +static inline int16x8_t vld1q(const int16_t *src) { return vld1q_s16(src); } +static inline uint16x8_t vld1q(const uint16_t *src) { return vld1q_u16(src); } +static inline int32x4_t vld1q(const int32_t *src) { return vld1q_s32(src); } +static inline uint32x4_t vld1q(const uint32_t *src) { return vld1q_u32(src); } +static inline int64x2_t vld1q(const int64_t *src) { return vld1q_s64(src); } +static inline uint64x2_t vld1q(const uint64_t *src) { return vld1q_u64(src); } +static inline float32x4_t vld1q(const float32_t *src) { return vld1q_f32(src); } + +static inline int8x16x2_t vld2q(const int8_t *src) { return vld2q_s8(src); } +static inline uint8x16x2_t vld2q(const uint8_t *src) { return vld2q_u8(src); } +static inline int16x8x2_t vld2q(const int16_t *src) { return vld2q_s16(src); } +static inline uint16x8x2_t vld2q(const uint16_t *src) { return vld2q_u16(src); } +static inline int32x4x2_t vld2q(const int32_t *src) { return vld2q_s32(src); } +static inline uint32x4x2_t vld2q(const uint32_t *src) { return vld2q_u32(src); } +static inline int64x2x2_t vld2q(const int64_t *src) { return vld2q_s64(src); } +static inline uint64x2x2_t vld2q(const uint64_t *src) { return vld2q_u64(src); } +static inline float32x4x2_t vld2q(const float32_t *src) { return vld2q_f32(src); } + +static inline int8x16x3_t vld3q(const int8_t *src) { return vld3q_s8(src); } +static inline uint8x16x3_t vld3q(const uint8_t *src) { return vld3q_u8(src); } +static inline int16x8x3_t vld3q(const int16_t *src) { return vld3q_s16(src); } +static inline uint16x8x3_t vld3q(const uint16_t *src) { return vld3q_u16(src); } +static inline int32x4x3_t vld3q(const int32_t *src) { return vld3q_s32(src); } +static inline uint32x4x3_t vld3q(const uint32_t *src) { return vld3q_u32(src); } +static inline int64x2x3_t vld3q(const int64_t *src) { return vld3q_s64(src); } +static inline uint64x2x3_t vld3q(const uint64_t *src) { return vld3q_u64(src); } +static inline float32x4x3_t vld3q(const float32_t *src) { return vld3q_f32(src); } + +static inline int8x16x4_t vld4q(const int8_t *src) { return vld4q_s8(src); } +static inline uint8x16x4_t vld4q(const uint8_t *src) { return vld4q_u8(src); } +static inline int16x8x4_t vld4q(const int16_t *src) { return vld4q_s16(src); } +static inline uint16x8x4_t vld4q(const uint16_t *src) { return vld4q_u16(src); } +static inline int32x4x4_t vld4q(const int32_t *src) { return vld4q_s32(src); } +static inline uint32x4x4_t vld4q(const uint32_t *src) { return vld4q_u32(src); } +static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); } +static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); } +static inline float32x4x4_t vld4q(const float32_t *src) { return vld4q_f32(src); } + +static inline int8x16x2_t vld1q_x2(const int8_t *src) { return vld1q_s8_x2(src); } +static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { return vld1q_u8_x2(src); } +static inline int16x8x2_t vld1q_x2(const int16_t *src) { return vld1q_s16_x2(src); } +static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { return vld1q_u16_x2(src); } +static inline int32x4x2_t vld1q_x2(const int32_t *src) { return vld1q_s32_x2(src); } +static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { return vld1q_u32_x2(src); } +static inline int64x2x2_t vld1q_x2(const int64_t *src) { return vld1q_s64_x2(src); } +static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { return vld1q_u64_x2(src); } +static inline float32x4x2_t vld1q_x2(const float32_t *src) { return vld1q_f32_x2(src); } + +static inline int8x16x3_t vld1q_x3(const int8_t *src) { return vld1q_s8_x3(src); } +static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { return vld1q_u8_x3(src); } +static inline int16x8x3_t vld1q_x3(const int16_t *src) { return vld1q_s16_x3(src); } +static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { return vld1q_u16_x3(src); } +static inline int32x4x3_t vld1q_x3(const int32_t *src) { return vld1q_s32_x3(src); } +static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { return vld1q_u32_x3(src); } +static inline int64x2x3_t vld1q_x3(const int64_t *src) { return vld1q_s64_x3(src); } +static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { return vld1q_u64_x3(src); } +static inline float32x4x3_t vld1q_x3(const float32_t *src) { return vld1q_f32_x3(src); } + +static inline int8x16x4_t vld1q_x4(const int8_t *src) { return vld1q_s8_x4(src); } +static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { return vld1q_u8_x4(src); } +static inline int16x8x4_t vld1q_x4(const int16_t *src) { return vld1q_s16_x4(src); } +static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { return vld1q_u16_x4(src); } +static inline int32x4x4_t vld1q_x4(const int32_t *src) { return vld1q_s32_x4(src); } +static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { return vld1q_u32_x4(src); } +static inline int64x2x4_t vld1q_x4(const int64_t *src) { return vld1q_s64_x4(src); } +static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { return vld1q_u64_x4(src); } +static inline float32x4x4_t vld1q_x4(const float32_t *src) { return vld1q_f32_x4(src); } // ----------------------------------------------------------------------------- // NEON store operations -- GitLab