From 521f705df6c689f9ea5157c684eedeba9c41604b Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Wed, 19 Feb 2025 17:14:30 +0000 Subject: [PATCH 1/4] Remove continuous multivec LS instructions on GCC <= 11 Continuous loads and store instructions produce sub optimal code generation on GCC version <= 11, these instructions are not supported on GCC version <=8. This commit ensures non-continuous load and store instructions are used instead, when the library is compiled with GCC11 and prior. --- kleidicv/CMakeLists.txt | 8 +- kleidicv/include/kleidicv/config.h.in | 4 +- .../filters/separable_filter_3x3_neon.h | 21 +- kleidicv/include/kleidicv/neon.h | 292 +++++++++++++++++- kleidicv/include/kleidicv/neon_intrinsics.h | 52 +--- kleidicv/src/arithmetics/in_range_neon.cpp | 6 +- kleidicv/src/arithmetics/scale_neon.cpp | 18 +- kleidicv/src/conversions/float_conv_neon.cpp | 8 +- kleidicv/src/conversions/gray_to_rgb_neon.cpp | 17 +- kleidicv/src/conversions/merge_neon.cpp | 39 ++- kleidicv/src/conversions/rgb_to_rgb_neon.cpp | 12 +- kleidicv/src/conversions/rgb_to_yuv_neon.cpp | 5 +- .../src/conversions/yuv_sp_to_rgb_neon.cpp | 12 +- kleidicv/src/filters/gaussian_blur_neon.cpp | 12 +- kleidicv/src/resize/resize_linear_neon.cpp | 24 +- .../src/transform/remap_s16point5_neon.cpp | 6 +- 16 files changed, 411 insertions(+), 125 deletions(-) diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt index 0b0fa2c81..d152093c6 100644 --- a/kleidicv/CMakeLists.txt +++ b/kleidicv/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 @@ -51,6 +51,12 @@ option(KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE "Internal - If turned ON interlea option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny algorithm" OFF) option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates 
bit exact result compared to OpenCV's original implementation" ON) +if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11 OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS ON) +else() + set(KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS OFF) +endif() + if(KLEIDICV_ENABLE_SME2 AND NOT KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS) set(KLEIDICV_ALWAYS_ENABLE_SME2 ON) endif() diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in index 18a0970f3..39930f9b3 100644 --- a/kleidicv/include/kleidicv/config.h.in +++ b/kleidicv/include/kleidicv/config.h.in @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -21,6 +21,8 @@ #cmakedefine01 KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV +#cmakedefine01 KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + // Set to '1' if compiling NEON code paths, otherwise it is set to '0'. 
#ifndef KLEIDICV_TARGET_NEON #define KLEIDICV_TARGET_NEON 0 diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h index 3fecea047..a461facad 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 #ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H +#include "kleidicv/config.h" #include "kleidicv/neon.h" #include "kleidicv/workspace/border_3x3.h" @@ -45,9 +46,12 @@ class SeparableFilter { auto src_1 = &src_rows.at(border_offsets.c1())[index]; auto src_2 = &src_rows.at(border_offsets.c2())[index]; - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); + typename SourceVecTraits::Vector2Type src_0_x2; + SourceVecTraits::load(&src_0[0], src_0_x2); + typename SourceVecTraits::Vector2Type src_1_x2; + SourceVecTraits::load(&src_1[0], src_1_x2); + typename SourceVecTraits::Vector2Type src_2_x2; + SourceVecTraits::load(&src_2[0], src_2_x2); SourceVectorType src_a[3], src_b[3]; src_a[0] = src_0_x2.val[0]; @@ -90,9 +94,12 @@ class SeparableFilter { auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); + typename BufferVecTraits::Vector2Type src_0_x2; + BufferVecTraits::load(&src_0[0], src_0_x2); + typename BufferVecTraits::Vector2Type src_1_x2; + BufferVecTraits::load(&src_1[0], src_1_x2); + typename BufferVecTraits::Vector2Type src_2_x2; + BufferVecTraits::load(&src_2[0], src_2_x2); BufferVectorType src_a[3], src_b[3]; src_a[0] = 
src_0_x2.val[0]; diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index ecafd7d38..940cf2707 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -173,6 +173,228 @@ class VecTraitsBase : public VectorTypes { // Maximum number of lanes in a vector. static constexpr size_t max_num_lanes() { return num_lanes(); } +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + + private: + static inline int8x16x2_t vld1q_x2(const int8_t *src) { + return vld1q_s8_x2(src); + } + + static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { + return vld1q_u8_x2(src); + } + + static inline int16x8x2_t vld1q_x2(const int16_t *src) { + return vld1q_s16_x2(src); + } + + static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { + return vld1q_u16_x2(src); + } + + static inline int32x4x2_t vld1q_x2(const int32_t *src) { + return vld1q_s32_x2(src); + } + + static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { + return vld1q_u32_x2(src); + } + + static inline int64x2x2_t vld1q_x2(const int64_t *src) { + return vld1q_s64_x2(src); + } + + static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { + return vld1q_u64_x2(src); + } + + static inline float32x4x2_t vld1q_x2(const float32_t *src) { + return vld1q_f32_x2(src); + } + + static inline int8x16x3_t vld1q_x3(const int8_t *src) { + return vld1q_s8_x3(src); + } + + static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { + return vld1q_u8_x3(src); + } + + static inline int16x8x3_t vld1q_x3(const int16_t *src) { + return vld1q_s16_x3(src); + } + + static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { + return vld1q_u16_x3(src); + } + + static inline int32x4x3_t vld1q_x3(const int32_t *src) { + return vld1q_s32_x3(src); + } + + static inline uint32x4x3_t vld1q_x3(const 
uint32_t *src) { + return vld1q_u32_x3(src); + } + + static inline int64x2x3_t vld1q_x3(const int64_t *src) { + return vld1q_s64_x3(src); + } + + static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { + return vld1q_u64_x3(src); + } + + static inline float32x4x3_t vld1q_x3(const float32_t *src) { + return vld1q_f32_x3(src); + } + + static inline int8x16x4_t vld1q_x4(const int8_t *src) { + return vld1q_s8_x4(src); + } + + static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { + return vld1q_u8_x4(src); + } + + static inline int16x8x4_t vld1q_x4(const int16_t *src) { + return vld1q_s16_x4(src); + } + + static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { + return vld1q_u16_x4(src); + } + + static inline int32x4x4_t vld1q_x4(const int32_t *src) { + return vld1q_s32_x4(src); + } + + static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { + return vld1q_u32_x4(src); + } + + static inline int64x2x4_t vld1q_x4(const int64_t *src) { + return vld1q_s64_x4(src); + } + + static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { + return vld1q_u64_x4(src); + } + + static inline float32x4x4_t vld1q_x4(const float32_t *src) { + return vld1q_f32_x4(src); + } + + static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { + vst1q_s8_x2(dst, vec); + } + + static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { + vst1q_u8_x2(dst, vec); + } + + static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { + vst1q_s16_x2(dst, vec); + } + + static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { + vst1q_u16_x2(dst, vec); + } + + static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { + vst1q_s32_x2(dst, vec); + } + + static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { + vst1q_u32_x2(dst, vec); + } + + static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { + vst1q_s64_x2(dst, vec); + } + + static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { + vst1q_u64_x2(dst, vec); + } + + static inline void vst1q_x2(float32_t *dst, 
float32x4x2_t vec) { + vst1q_f32_x2(dst, vec); + } + + static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) { + vst1q_s8_x3(dst, vec); + } + + static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) { + vst1q_u8_x3(dst, vec); + } + + static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) { + vst1q_s16_x3(dst, vec); + } + + static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) { + vst1q_u16_x3(dst, vec); + } + + static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) { + vst1q_s32_x3(dst, vec); + } + + static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) { + vst1q_u32_x3(dst, vec); + } + + static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) { + vst1q_s64_x3(dst, vec); + } + + static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) { + vst1q_u64_x3(dst, vec); + } + + static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) { + vst1q_f32_x3(dst, vec); + } + + static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { + vst1q_s8_x4(dst, vec); + } + + static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { + vst1q_u8_x4(dst, vec); + } + + static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { + vst1q_s16_x4(dst, vec); + } + + static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { + vst1q_u16_x4(dst, vec); + } + + static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { + vst1q_s32_x4(dst, vec); + } + + static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { + vst1q_u32_x4(dst, vec); + } + + static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { + vst1q_s64_x4(dst, vec); + } + + static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { + vst1q_u64_x4(dst, vec); + } + + static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { + vst1q_f32_x4(dst, vec); + } + + public: +#endif + // Loads a single vector from 'src'. 
static inline void load(const ScalarType *src, VectorType &vec) { vec = vld1q(&src[0]); @@ -180,17 +402,32 @@ class VecTraitsBase : public VectorTypes { // Loads two consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector2Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x2(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; +#endif } // Loads three consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector3Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x3(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes()))}; +#endif } // Loads four consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector4Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x4(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes())), + vld1q(&src[0] + (3 * num_lanes()))}; +#endif } // Loads two consecutive vectors from 'src'. @@ -203,22 +440,47 @@ class VecTraitsBase : public VectorTypes { // Loads 2x2 consecutive vectors from 'src'. static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0, Vector2Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x2(&src[0]); vec_1 = vld1q_x2(&src[num_lanes() * 2]); +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; + vec_1 = {vld1q(&src[num_lanes() * 2]), + vld1q(&src[num_lanes() * 2] + num_lanes())}; +#endif } // Loads 2x3 consecutive vectors from 'src'. 
static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0, Vector3Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x3(&src[0]); vec_1 = vld1q_x3(&src[num_lanes() * 3]); +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes()))}; + vec_1 = {vld1q(&src[num_lanes() * 3]), + vld1q(&src[num_lanes() * 3] + num_lanes()), + vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))}; +#endif } // Loads 2x4 consecutive vectors from 'src'. static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0, Vector4Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x4(&src[0]); vec_1 = vld1q_x4(&src[num_lanes() * 4]); + +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes())), + vld1q(&src[0] + (3 * num_lanes()))}; + vec_1 = {vld1q(&src[num_lanes() * 4]), + vld1q(&src[num_lanes() * 4] + num_lanes()), + vld1q(&src[num_lanes() * 4] + (2 * num_lanes())), + vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))}; +#endif } // Stores a single vector to 'dst'. @@ -228,7 +490,35 @@ class VecTraitsBase : public VectorTypes { // Stores two consecutive vectors to 'dst'. static inline void store(Vector2Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vst1q_x2(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); +#endif + } + + // Stores three consecutive vectors to 'dst'. + static inline void store(Vector3Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + vst1q_x3(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); + vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); +#endif + } + + // Stores four consecutive vectors to 'dst'. 
+ static inline void store(Vector4Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + vst1q_x4(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); + vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); + vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]); +#endif } // Stores two consecutive vectors to 'dst'. diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h index 420a3733a..77a11c20d 100644 --- a/kleidicv/include/kleidicv/neon_intrinsics.h +++ b/kleidicv/include/kleidicv/neon_intrinsics.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -375,36 +375,6 @@ static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); } static inline float32x4x4_t vld4q(const float32_t *src) { return vld4q_f32(src); } -static inline int8x16x2_t vld1q_x2(const int8_t *src) { return vld1q_s8_x2(src); } -static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { return vld1q_u8_x2(src); } -static inline int16x8x2_t vld1q_x2(const int16_t *src) { return vld1q_s16_x2(src); } -static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { return vld1q_u16_x2(src); } -static inline int32x4x2_t vld1q_x2(const int32_t *src) { return vld1q_s32_x2(src); } -static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { return vld1q_u32_x2(src); } -static inline int64x2x2_t vld1q_x2(const int64_t *src) { return vld1q_s64_x2(src); } -static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { return vld1q_u64_x2(src); } -static inline float32x4x2_t vld1q_x2(const float32_t *src) { return vld1q_f32_x2(src); } - -static inline int8x16x3_t vld1q_x3(const int8_t *src) { return vld1q_s8_x3(src); } -static inline uint8x16x3_t vld1q_x3(const uint8_t *src) 
{ return vld1q_u8_x3(src); } -static inline int16x8x3_t vld1q_x3(const int16_t *src) { return vld1q_s16_x3(src); } -static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { return vld1q_u16_x3(src); } -static inline int32x4x3_t vld1q_x3(const int32_t *src) { return vld1q_s32_x3(src); } -static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { return vld1q_u32_x3(src); } -static inline int64x2x3_t vld1q_x3(const int64_t *src) { return vld1q_s64_x3(src); } -static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { return vld1q_u64_x3(src); } -static inline float32x4x3_t vld1q_x3(const float32_t *src) { return vld1q_f32_x3(src); } - -static inline int8x16x4_t vld1q_x4(const int8_t *src) { return vld1q_s8_x4(src); } -static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { return vld1q_u8_x4(src); } -static inline int16x8x4_t vld1q_x4(const int16_t *src) { return vld1q_s16_x4(src); } -static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { return vld1q_u16_x4(src); } -static inline int32x4x4_t vld1q_x4(const int32_t *src) { return vld1q_s32_x4(src); } -static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { return vld1q_u32_x4(src); } -static inline int64x2x4_t vld1q_x4(const int64_t *src) { return vld1q_s64_x4(src); } -static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { return vld1q_u64_x4(src); } -static inline float32x4x4_t vld1q_x4(const float32_t *src) { return vld1q_f32_x4(src); } - // ----------------------------------------------------------------------------- // NEON store operations // ----------------------------------------------------------------------------- @@ -458,26 +428,6 @@ static inline void vst4q(int64_t *dst, int64x2x4_t vec) { vst4q_s64(dst, vec static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); } static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); } -static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { vst1q_s8_x2(dst, vec); } -static inline void 
vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { vst1q_u8_x2(dst, vec); } -static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { vst1q_s16_x2(dst, vec); } -static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { vst1q_u16_x2(dst, vec); } -static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { vst1q_s32_x2(dst, vec); } -static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { vst1q_u32_x2(dst, vec); } -static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { vst1q_s64_x2(dst, vec); } -static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { vst1q_u64_x2(dst, vec); } -static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { vst1q_f32_x2(dst, vec); } - -static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { vst1q_s8_x4(dst, vec); } -static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { vst1q_u8_x4(dst, vec); } -static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { vst1q_s16_x4(dst, vec); } -static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { vst1q_u16_x4(dst, vec); } -static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { vst1q_s32_x4(dst, vec); } -static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { vst1q_u32_x4(dst, vec); } -static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { vst1q_s64_x4(dst, vec); } -static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { vst1q_u64_x4(dst, vec); } -static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { vst1q_f32_x4(dst, vec); } - // ----------------------------------------------------------------------------- // vreinterpret* // ----------------------------------------------------------------------------- diff --git a/kleidicv/src/arithmetics/in_range_neon.cpp b/kleidicv/src/arithmetics/in_range_neon.cpp index d0614d926..f03d9ddec 100644 --- a/kleidicv/src/arithmetics/in_range_neon.cpp +++ b/kleidicv/src/arithmetics/in_range_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates 
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -58,7 +58,9 @@ class InRange { Columns dst) { LoopUnroll{width, SrcVecTraits::num_lanes()} .unroll_n_times<4>([&](size_t step) { - SrcVector4Type src_vector = vld1q_f32_x4(&src[0]); + SrcVector4Type src_vector; + SrcVecTraits::load(&src[0], src_vector); + DstVectorType result_vector = vector_path(src_vector); vst1q(&dst[0], result_vector); src += ptrdiff_t(step); diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp index 7c46ec045..60621cca9 100644 --- a/kleidicv/src/arithmetics/scale_neon.cpp +++ b/kleidicv/src/arithmetics/scale_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -88,16 +88,18 @@ class ScaleUint8Tbx final : public ScaleIntBase { for (size_t i = 0; i < TableLength; ++i) { values[i] = this->scale_value(i); } - t0_3_ = vld1q_u8_x3(values); - t1_3_ = vld1q_u8_x3(values + 3 * VecTraits::num_lanes()); - t2_2_ = vld1q_u8_x2(values + (3 + 3) * VecTraits::num_lanes()); - t3_3_ = vld1q_u8_x3(values + (3 + 3 + 2) * VecTraits::num_lanes()); - t4_2_ = vld1q_u8_x2(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes()); - t5_3_ = vld1q_u8_x3(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes()); + + VecTraits::load(values, t0_3_); + VecTraits::load(values + 3 * VecTraits::num_lanes(), t1_3_); + VecTraits::load(values + (3 + 3) * VecTraits::num_lanes(), t2_2_); + VecTraits::load(values + (3 + 3 + 2) * VecTraits::num_lanes(), t3_3_); + VecTraits::load(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_); + VecTraits::load(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(), + t5_3_); + v_step3_ = vdupq_n_u8(3 * VecTraits::num_lanes()); v_step2_ = vdupq_n_u8(2 * VecTraits::num_lanes()); } - VectorType vector_path(VectorType src) { 
VectorType dst = vqtbl3q_u8(t0_3_, src); src = vsubq_u8(src, v_step3_); diff --git a/kleidicv/src/conversions/float_conv_neon.cpp b/kleidicv/src/conversions/float_conv_neon.cpp index e220dd914..1ef00c5eb 100644 --- a/kleidicv/src/conversions/float_conv_neon.cpp +++ b/kleidicv/src/conversions/float_conv_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -124,13 +124,14 @@ s8_to_f32(const int8_t* src, size_t src_stride, float* dst, size_t dst_stride, int32x4_t c = vreinterpretq_s32_s8(vqtbl1q_s8(input, index2)); int32x4_t d = vreinterpretq_s32_s8(vqtbl1q_s8(input, index3)); // Convert to float and divide by 2^24. + float32x4x4_t output = { vcvtq_n_f32_s32(a, 24), vcvtq_n_f32_s32(b, 24), vcvtq_n_f32_s32(c, 24), vcvtq_n_f32_s32(d, 24), }; - vst1q_f32_x4(dst + x, output); + neon::VecTraits::store(output, dst + x); } for (; x != width; ++x) { disable_loop_vectorization(); @@ -169,13 +170,14 @@ u8_to_f32(const uint8_t* src, size_t src_stride, float* dst, size_t dst_stride, uint32x4_t b = vreinterpretq_u32_u8(vqtbl1q_u8(input, index1)); uint32x4_t c = vreinterpretq_u32_u8(vqtbl1q_u8(input, index2)); uint32x4_t d = vreinterpretq_u32_u8(vqtbl1q_u8(input, index3)); + float32x4x4_t output = { vcvtq_f32_u32(a), vcvtq_f32_u32(b), vcvtq_f32_u32(c), vcvtq_f32_u32(d), }; - vst1q_f32_x4(dst + x, output); + neon::VecTraits::store(output, dst + x); } for (; x != width; ++x) { disable_loop_vectorization(); diff --git a/kleidicv/src/conversions/gray_to_rgb_neon.cpp b/kleidicv/src/conversions/gray_to_rgb_neon.cpp index 6b0d15256..6aceacc3a 100644 --- a/kleidicv/src/conversions/gray_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/gray_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // 
SPDX-License-Identifier: Apache-2.0 @@ -15,7 +15,9 @@ class GrayToRGB final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - GrayToRGB() : indices_{vld1q_u8_x3(kGrayToRGBTableIndices)} {} + GrayToRGB() : indices_{} { + VecTraits::load(kGrayToRGBTableIndices, indices_); + } #else GrayToRGB() = default; #endif @@ -31,7 +33,7 @@ class GrayToRGB final : public UnrollTwice { dst_vect.val[0] = vqtbl1q_u8(src_vect, indices_.val[0]); dst_vect.val[1] = vqtbl1q_u8(src_vect, indices_.val[1]); dst_vect.val[2] = vqtbl1q_u8(src_vect, indices_.val[2]); - vst1q_u8_x3(dst, dst_vect); + VecTraits::store(dst_vect, dst); #endif } @@ -41,11 +43,11 @@ class GrayToRGB final : public UnrollTwice { private: #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE + static constexpr uint8_t kGrayToRGBTableIndices[48] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15}; - uint8x16x3_t indices_; #endif }; // end of class GrayToRGB @@ -60,7 +62,8 @@ class GrayToRGBA final : public UnrollTwice { GrayToRGBA() : alpha_{vdupq_n_u8(0xff)} {} #else // NOLINTBEGIN(hicpp-member-init) - GrayToRGBA() : indices_{vld1q_u8_x4(kGrayToRGBATableIndices)} { + GrayToRGBA() : indices_{} { + VecTraits::load(kGrayToRGBATableIndices, indices_); src_and_alpha_.val[1] = vdupq_n_u8(0xff); } // NOLINTEND(hicpp-member-init) @@ -80,7 +83,8 @@ class GrayToRGBA final : public UnrollTwice { dst_vect.val[1] = vqtbl2q_u8(src_and_alpha_, indices_.val[1]); dst_vect.val[2] = vqtbl2q_u8(src_and_alpha_, indices_.val[2]); dst_vect.val[3] = vqtbl2q_u8(src_and_alpha_, indices_.val[3]); - vst1q_u8_x4(dst, dst_vect); + VecTraits::store(dst_vect, dst); + #endif } @@ -101,6 +105,7 @@ class GrayToRGBA final : public UnrollTwice { 4, 4, 4, 16, 5, 5, 5, 16, 6, 6, 6, 16, 7, 7, 7, 16, 8, 8, 8, 16, 9, 9, 9, 16, 10, 10, 10, 16, 11, 11, 11, 16, 12, 12, 12, 16, 13, 13, 13, 
16, 14, 14, 14, 16, 15, 15, 15, 16}; + #endif }; // end of class GrayToRGBA diff --git a/kleidicv/src/conversions/merge_neon.cpp b/kleidicv/src/conversions/merge_neon.cpp index 5b7bc0157..aaef9e8bf 100644 --- a/kleidicv/src/conversions/merge_neon.cpp +++ b/kleidicv/src/conversions/merge_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -41,11 +41,12 @@ class Merge2 final : public UnrollTwice { dst_vect.val[0] = src_a; dst_vect.val[1] = src_b; vst2q(&dst[0], dst_vect); -#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE +#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE Vector2Type dst_vect; dst_vect.val[0] = vzip1q(src_a, src_b); dst_vect.val[1] = vzip2q(src_a, src_b); - vst1q_x2(&dst[0], dst_vect); + VecTraits::store(dst_vect, &dst[0]); + #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -68,26 +69,30 @@ class Merge3 final : public UnrollTwice { using Vector3Type = typename VecTraits::Vector3Type; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - Merge3() : table_indices_{vld1q_u8_x3(lookup_table(ScalarType()))} {} + + Merge3() : table_indices_{} { + neon::VecTraits::load(lookup_table(ScalarType()), table_indices_); + } + #endif void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, ScalarType *dst) { -#if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE Vector3Type dst_vect; +#if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE dst_vect.val[0] = src_a; dst_vect.val[1] = src_b; dst_vect.val[2] = src_c; vst3q(&dst[0], dst_vect); #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - uint8x16x3_t src_vect, dst_vect; + uint8x16x3_t src_vect; src_vect.val[0] = vreinterpretq_u8(src_a); src_vect.val[1] = vreinterpretq_u8(src_b); src_vect.val[2] = vreinterpretq_u8(src_c); dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]); dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]); dst_vect.val[2] = 
vqtbl3q_u8(src_vect, table_indices_.val[2]); - vst1q_u8_x3(reinterpret_cast(&dst[0]), dst_vect); + VecTraits::store(dst_vect, &dst[0]); #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -100,21 +105,20 @@ class Merge3 final : public UnrollTwice { private: #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - static const uint8_t *lookup_table(uint8_t) { + static uint8_t *lookup_table(uint8_t) { // clang-format off - static constexpr uint8_t kIndices[48] = { + static uint8_t kIndices[48] = { 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5, 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26, 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, }; - // clang-format on return &kIndices[0]; } // Lookup table for 16-bit inputs. - static const uint8_t *lookup_table(uint16_t) { + static uint8_t *lookup_table(uint16_t) { // clang-format off - static constexpr uint8_t kIndices[48] = { + static uint8_t kIndices[48] = { 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21, 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11, 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47, @@ -215,7 +219,8 @@ class Merge3 final : public UnrollTwice { dst_vect.val[1] = src_c; dst_vect.val[1][1] = src_a[1]; dst_vect.val[2] = vzip2q_u64(src_b, src_c); - vst1q_u64_x3(&dst[0], dst_vect); + + VecTraits::store(dst_vect, &dst[0]); } void scalar_path(const ScalarType *src_a, const ScalarType *src_b, @@ -273,7 +278,7 @@ class Merge4 final : public UnrollTwice { dst_vect.val[2] = src_c; dst_vect.val[3] = src_d; vst4q(&dst[0], dst_vect); -#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE +#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE auto zip1_a_b = double_width(vzip1q(src_a, src_b)); auto zip1_c_d = double_width(vzip1q(src_c, src_d)); auto zip2_a_b = double_width(vzip2q(src_a, src_b)); @@ -287,7 +292,9 @@ class Merge4 final : public UnrollTwice { dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d); dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d); 
dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d); - vst1q_x4(reinterpret_cast(&dst[0]), dst_vect); + neon::VecTraits::store( + dst_vect, reinterpret_cast(&dst[0])); + #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -346,7 +353,7 @@ class Merge4 final : public UnrollTwice { dst_vect.val[1] = vzip1q(src_c, src_d); dst_vect.val[2] = vzip2q(src_a, src_b); dst_vect.val[3] = vzip2q(src_c, src_d); - vst1q_x4(&dst[0], dst_vect); + VecTraits::store(dst_vect, &dst[0]); } void scalar_path(const ScalarType *src_a, const ScalarType *src_b, diff --git a/kleidicv/src/conversions/rgb_to_rgb_neon.cpp b/kleidicv/src/conversions/rgb_to_rgb_neon.cpp index 001fe94a7..c2cba62c5 100644 --- a/kleidicv/src/conversions/rgb_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/rgb_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -14,7 +14,7 @@ class RGBToBGR final : public UnrollTwice { using VecTraits = neon::VecTraits; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - RGBToBGR() : indices_{vld1q_u8_x3(kRGBToBGRTableIndices)} {} + RGBToBGR() : indices_{} { VecTraits::load(kRGBToBGRTableIndices, indices_); } #else RGBToBGR() = default; #endif @@ -30,7 +30,10 @@ class RGBToBGR final : public UnrollTwice { vst3q_u8(dst, dst_vect); #else - uint8x16x3_t src_vect = vld1q_u8_x3(src); + + uint8x16x3_t src_vect; + VecTraits::load(src, src_vect); + uint8x16x3_t dst_vect; uint8x16x2_t src_vect_0_1; @@ -45,7 +48,7 @@ class RGBToBGR final : public UnrollTwice { dst_vect.val[1] = vqtbl3q_u8(src_vect, indices_.val[1]); dst_vect.val[2] = vqtbl2q_u8(src_vect_1_2, indices_.val[2]); - vst1q_u8_x3(dst, dst_vect); + VecTraits::store(dst_vect, dst); #endif } @@ -62,7 +65,6 @@ class RGBToBGR final : public UnrollTwice { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21, 26, 25, 24, 29, 28, 27, 32, 31, 14, 19, 18, 
17, 22, 21, 20, 25, 24, 23, 28, 27, 26, 31, 30, 29}; - uint8x16x3_t indices_; #endif }; // end of class RGBToBGR diff --git a/kleidicv/src/conversions/rgb_to_yuv_neon.cpp b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp index c8b89502a..55a8b8f0d 100644 --- a/kleidicv/src/conversions/rgb_to_yuv_neon.cpp +++ b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -28,7 +28,8 @@ class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop { RawSourceVectorType vsrc; int16x8_t r_l, r_h, g_l, g_h, b_l, b_h; if constexpr (ALPHA) { - vsrc = vld1q_u8_x4(src); + VecTraits::load(src, vsrc); + uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]); uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]); if constexpr (BGR) { diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp index 590894cde..1deacceae 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -28,8 +28,11 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { 128 * (kUVWeights[1] + kUVWeights[2]))}, b_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, - de_interleave_indices_{vld1q_s8_x4(kDeInterleaveTableIndices)}, - is_nv21_(is_nv21) {} + de_interleave_indices_{}, + is_nv21_(is_nv21) { + neon::VecTraits::load(kDeInterleaveTableIndices, + de_interleave_indices_); + } // Returns the number of channels in the output image. 
static constexpr size_t output_channels() { @@ -266,8 +269,8 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { int8x16x4_t de_interleave_indices_; const bool is_nv21_; - // clang-format off + static constexpr int8_t kDeInterleaveTableIndices[64] = { /* low and even */ 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, @@ -278,6 +281,7 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { /* high and odd */ 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, }; + // clang-format on }; // end of class YUVSpToRGBxOrBGRx diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index af3b5baef..b28de5876 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -37,7 +37,7 @@ class GaussianBlur { using BufferVectorType = typename VecTraits::VectorType; using DestinationType = ScalarType; - explicit GaussianBlur([[maybe_unused]] float sigma) {} + explicit GaussianBlur([[maybe_unused]] float sigma) // Applies vertical filtering vector using SIMD operations. // @@ -179,7 +179,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) +explicit GaussianBlur([[maybe_unused]] float sigma) : const_7_u16_{vdupq_n_u16(7)}, const_7_u32_{vdupq_n_u32(7)}, const_9_u16_{vdupq_n_u16(9)} {} @@ -434,8 +434,7 @@ class GaussianBlur { vmlal_u16(acc.val[2], vget_low_u16(acc_7_h), const_158_u16_half_); acc.val[3] = vmlal_u16(acc.val[3], vget_high_u16(acc_7_h), const_158_u16_half_); - - vst1q_u32_x4(&dst[0], acc); + neon::VecTraits::store(acc, &dst[0]); } // Applies vertical filtering vector using scalar operations. 
@@ -565,8 +564,7 @@ class GaussianBlur { } uint32x4x4_t result = {acc_l_l, acc_l_h, acc_h_l, acc_h_h}; - - vst1q_u32_x4(&dst[0], result); + neon::VecTraits::store(result, &dst[0]); } void vertical_scalar_path(const SourceType src[KernelSize], diff --git a/kleidicv/src/resize/resize_linear_neon.cpp b/kleidicv/src/resize/resize_linear_neon.cpp index 7b6d1d54d..e56bfde67 100644 --- a/kleidicv/src/resize/resize_linear_neon.cpp +++ b/kleidicv/src/resize/resize_linear_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -6,6 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/operations.h" #include "kleidicv/resize/resize_linear.h" namespace kleidicv::neon { @@ -887,38 +888,43 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); dst_0.val[1] = lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); - vst1q_x2(dst_row0, dst_0); + neon::VecTraits::store(dst_0, dst_row0); float32x4x2_t dst_7; dst_7.val[0] = lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); dst_7.val[1] = lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); - vst1q_x2(dst_row7, dst_7); + neon::VecTraits::store(dst_7, dst_row7); float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); float32x4x2_t dst; dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); - vst1q_x2(dst_row1, dst); + + neon::VecTraits::store(dst, dst_row1); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); - vst1q_x2(dst_row2, dst); + + neon::VecTraits::store(dst, dst_row2); dst.val[0] = 
lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); - vst1q_x2(dst_row3, dst); + + neon::VecTraits::store(dst, dst_row3); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); - vst1q_x2(dst_row4, dst); + + neon::VecTraits::store(dst, dst_row4); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); - vst1q_x2(dst_row5, dst); + + neon::VecTraits::store(dst, dst_row5); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); - vst1q_x2(dst_row6, dst); + neon::VecTraits::store(dst, dst_row6); dst_row0 += 8; dst_row1 += 8; dst_row2 += 8; diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 40d8fa85a..102d7d3c9 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -602,7 +602,8 @@ inline uint8x16_t load_4px_4ch(Rows src_rows, } inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns dst) { - vst1q_u8_x2(&dst[0], res); + using ScalarType = uint8_t; + neon::VecTraits::store(res, &dst[0]); } inline uint16x8_t load_2px_4ch(Rows src_rows, @@ -612,7 +613,8 @@ inline uint16x8_t load_2px_4ch(Rows src_rows, } inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns dst) { - vst1q_u16_x4(&dst[0], res); + using ScalarType = uint16_t; + neon::VecTraits::store(res, &dst[0]); } // Replicate border specific functions -- GitLab From ab857d42e21e3774df96b3a562813dca40795072 Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Thu, 27 Mar 2025 11:14:14 +0000 Subject: [PATCH 2/4] Remove cfloat/climits headers from resize test api --- test/api/test_resize_linear.cpp | 62 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git 
a/test/api/test_resize_linear.cpp b/test/api/test_resize_linear.cpp index 576ec8684..409741c28 100644 --- a/test/api/test_resize_linear.cpp +++ b/test/api/test_resize_linear.cpp @@ -1,11 +1,10 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 #include #include -#include #include #include #include @@ -623,11 +622,14 @@ INSTANTIATE_TEST_SUITE_P( Pf32{{{0, 255}}, {{0, 63.75F, 191.25F, 255}, {0, 63.75F, 191.25F, 255}}}, // 2*2 -> 4*4 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, - {{FLT_MAX, 2.8021173e38F, 1.6007058e38F, 1e38F}, + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), 2.8021173e38F, 1.6007058e38F, + 1e38F}, {2.5521173e38F, 2.101588e38F, 1.2005294e38F, 7.5e37F}, {8.5070577e37F, 7.0052933e37F, 4.0017645e37F, 2.5e37F}, - {0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN}}}, + {0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, // 3*3 -> 6*6 Pf32{{{1, 63, 164}, {28, 251, 35}, {218, 64, 99}}, {{1, 16.5F, 47.5F, 88.25F, 138.75F, 164}, @@ -655,12 +657,13 @@ INSTANTIATE_TEST_SUITE_P( 37.5F}, {200, 175, 125, 112.5F, 137.5F, 125, 75, 50}}}, // 35*2 -> 70*4 + // clang-format off Pf32{{{0, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, FLT_MAX, FLT_MAX, 104, 108, 227, 46, 162, + 9, 10, std::numeric_limits::max(), std::numeric_limits::max(), 104, 108, 227, 46, 162, 21, 220, 235, 183, 113, 225, 146, 196, 144, 104, 148, 19, 126, 172, 9, 12, 61}, - {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, FLT_MAX, - FLT_MAX, 105, 191, 106, 73, 148, 13, 161, 118, 21, 3, 34, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, std::numeric_limits::max(), + std::numeric_limits::max(), 105, 191, 106, 73, 148, 13, 161, 118, 21, 3, 34, 40, 150, 120, 68, 75, 14, 31, 124, 221, 214, 146}}, {{0.0F, 0.25F, 0.75F, 1.25F, 1.75F, 2.25F, 2.75F, 3.25F, @@ -735,10 +738,10 @@ INSTANTIATE_TEST_SUITE_P( 196.75F, 219.25F, 
215.75F, 197.0F, 163.0F, 146.0F}}}, // 2*2 -> 8*8 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, - {{FLT_MAX, FLT_MAX, 3.10247e38F, 2.5017644e38F, 1.9010587e38F, + Pf32{{{std::numeric_limits::max(), 1e38F}, {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), std::numeric_limits::max(), 3.10247e38F, 2.5017644e38F, 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, - {FLT_MAX, FLT_MAX, 3.1024702e38F, 2.5017644e38F, 1.9010587e38F, + {std::numeric_limits::max(), std::numeric_limits::max(), 3.1024702e38F, 2.5017644e38F, 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, {2.9774701e38F, 2.9774701e38F, 2.7146614e38F, 2.189044e38F, 1.6634263e38F, 1.1378087e38F, 8.75e37F, 8.75e37F}, @@ -748,10 +751,36 @@ INSTANTIATE_TEST_SUITE_P( 7.1289698e37F, 4.8763233e37F, 3.75e37F, 3.75e37F}, {4.2535288e37F, 4.2535288e37F, 3.8780877e37F, 3.1272055e37F, 2.3763234e37F, 1.6254411e37F, 1.25e37F, 1.25e37F}, - {0, 0, 0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN, FLT_TRUE_MIN, - FLT_TRUE_MIN}, - {0, 0, 0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN, FLT_TRUE_MIN, - FLT_TRUE_MIN}}}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, + // 35*2 -> 140*8 + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), + std::numeric_limits::max(), 3.10247e38F, 2.5017644e38F, + 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, + {std::numeric_limits::max(), + std::numeric_limits::max(), 3.1024702e38F, 2.5017644e38F, + 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, + {2.9774701e38F, 2.9774701e38F, 2.7146614e38F, 2.189044e38F, + 1.6634263e38F, 1.1378087e38F, 8.75e37F, 8.75e37F}, + {2.1267644e38F, 2.1267644e38F, 1.9390438e38F, 1.5636028e38F, + 1.1881617e38F, 8.1272051e37F, 6.25e37F, 6.25e37F}, + 
{1.2760587e38F, 1.2760587e38F, 1.1634263e38F, 9.3816164e37F, + 7.1289698e37F, 4.8763233e37F, 3.75e37F, 3.75e37F}, + {4.2535288e37F, 4.2535288e37F, 3.8780877e37F, 3.1272055e37F, + 2.3763234e37F, 1.6254411e37F, 1.25e37F, 1.25e37F}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, + // clang-format on // 35*2 -> 140*8 Pf32{{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 82, 155, 104, 108, 227, 46, 162, 21, 220, 235, 183, 113, 225, @@ -968,7 +997,8 @@ INSTANTIATE_TEST_SUITE_P( 216.625F, 214.875F, 205.5F, 188.5F, 171.5F, 154.5F, 146, 146}}}, // 2*2 -> 16*16 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, { {3.402823466e+38F, 3.402823466e+38F, 3.402823466e+38F, 3.402823466e+38F, 3.252647029e+38F, 2.952294156e+38F, -- GitLab From b7a40ef97d0a6f1b08b912714a630ae7d30972af Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Fri, 28 Feb 2025 12:03:34 +0000 Subject: [PATCH 3/4] Refactor maybe_unused statements in neon gaussian blur Prior formatting for maybe_unused statements is not supported on GCC8, code is refactored to increase compiler compatibility. 
--- kleidicv/src/filters/gaussian_blur_neon.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index b28de5876..5503b1eb8 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -37,7 +37,7 @@ class GaussianBlur { using BufferVectorType = typename VecTraits::VectorType; using DestinationType = ScalarType; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) {} // Applies vertical filtering vector using SIMD operations. // @@ -98,7 +98,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_6_u8_half_{vdup_n_u8(6)}, const_6_u16_{vdupq_n_u16(6)}, const_4_u16_{vdupq_n_u16(4)} {} @@ -179,7 +179,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; -explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_7_u16_{vdupq_n_u16(7)}, const_7_u32_{vdupq_n_u32(7)}, const_9_u16_{vdupq_n_u16(9)} {} @@ -327,7 +327,7 @@ class GaussianBlur { using BufferType = uint32_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_11_u16_{vdupq_n_u16(11)}, const_11_u32_{vdupq_n_u32(11)}, const_25_u16_{vdupq_n_u16(25)}, -- GitLab From 74975482444561e654d5cd6ac13b7c468264201a Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Mon, 24 Mar 2025 12:59:27 +0000 Subject: [PATCH 4/4] Extend CI script to test MultiVec flag --- scripts/ci.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/ci.sh b/scripts/ci.sh index f2d097b8c..a48b29aa7 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -95,7 +95,7 @@ if [[ $(dpkg 
--print-architecture) = arm64 ]]; then build/ci/sanitize/test/api/kleidicv-api-test fi -# Build benchmarks, just to prevent bitrot. +# Build benchmarks, and build without the continuous load/store code path, just to prevent bitrot. cmake -S . -B build/ci/build-benchmark -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON \ @@ -107,7 +107,8 @@ cmake -S . -B build/ci/build-benchmark -G Ninja \ -DKLEIDICV_BENCHMARK=ON \ -DKLEIDICV_ENABLE_SME2=ON \ -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \ - -DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF + -DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF \ + -DKLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS=OFF ninja -C build/ci/build-benchmark kleidicv-benchmark # TODO: Cross-build OpenCV -- GitLab