From 521f705df6c689f9ea5157c684eedeba9c41604b Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Wed, 19 Feb 2025 17:14:30 +0000 Subject: [PATCH 1/4] Remove continuous multivec LS instructions on GCC <= 11 Continuous loads and store instructions produce sub optimal code generation on GCC version <= 11, these instructions are not supported on GCC version <=8. This commit ensures non-continuous load and store instructions are used instead, when the library is compiled with GCC11 and prior. --- kleidicv/CMakeLists.txt | 8 +- kleidicv/include/kleidicv/config.h.in | 4 +- .../filters/separable_filter_3x3_neon.h | 21 +- kleidicv/include/kleidicv/neon.h | 292 +++++++++++++++++- kleidicv/include/kleidicv/neon_intrinsics.h | 52 +--- kleidicv/src/arithmetics/in_range_neon.cpp | 6 +- kleidicv/src/arithmetics/scale_neon.cpp | 18 +- kleidicv/src/conversions/float_conv_neon.cpp | 8 +- kleidicv/src/conversions/gray_to_rgb_neon.cpp | 17 +- kleidicv/src/conversions/merge_neon.cpp | 39 ++- kleidicv/src/conversions/rgb_to_rgb_neon.cpp | 12 +- kleidicv/src/conversions/rgb_to_yuv_neon.cpp | 5 +- .../src/conversions/yuv_sp_to_rgb_neon.cpp | 12 +- kleidicv/src/filters/gaussian_blur_neon.cpp | 12 +- kleidicv/src/resize/resize_linear_neon.cpp | 24 +- .../src/transform/remap_s16point5_neon.cpp | 6 +- 16 files changed, 411 insertions(+), 125 deletions(-) diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt index 0b0fa2c81..d152093c6 100644 --- a/kleidicv/CMakeLists.txt +++ b/kleidicv/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 @@ -51,6 +51,12 @@ option(KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE "Internal - If turned ON interlea option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny algorithm" OFF) option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates 
bit exact result compared to OpenCV's original implementation" ON) +if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11 OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS ON) +else() + set(KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS OFF) +endif() + if(KLEIDICV_ENABLE_SME2 AND NOT KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS) set(KLEIDICV_ALWAYS_ENABLE_SME2 ON) endif() diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in index 18a0970f3..39930f9b3 100644 --- a/kleidicv/include/kleidicv/config.h.in +++ b/kleidicv/include/kleidicv/config.h.in @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -21,6 +21,8 @@ #cmakedefine01 KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV +#cmakedefine01 KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + // Set to '1' if compiling NEON code paths, otherwise it is set to '0'. 
#ifndef KLEIDICV_TARGET_NEON #define KLEIDICV_TARGET_NEON 0 diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h index 3fecea047..a461facad 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 #ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H +#include "kleidicv/config.h" #include "kleidicv/neon.h" #include "kleidicv/workspace/border_3x3.h" @@ -45,9 +46,12 @@ class SeparableFilter { auto src_1 = &src_rows.at(border_offsets.c1())[index]; auto src_2 = &src_rows.at(border_offsets.c2())[index]; - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); + typename SourceVecTraits::Vector2Type src_0_x2; + SourceVecTraits::load(&src_0[0], src_0_x2); + typename SourceVecTraits::Vector2Type src_1_x2; + SourceVecTraits::load(&src_1[0], src_1_x2); + typename SourceVecTraits::Vector2Type src_2_x2; + SourceVecTraits::load(&src_2[0], src_2_x2); SourceVectorType src_a[3], src_b[3]; src_a[0] = src_0_x2.val[0]; @@ -90,9 +94,12 @@ class SeparableFilter { auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); + typename BufferVecTraits::Vector2Type src_0_x2; + BufferVecTraits::load(&src_0[0], src_0_x2); + typename BufferVecTraits::Vector2Type src_1_x2; + BufferVecTraits::load(&src_1[0], src_1_x2); + typename BufferVecTraits::Vector2Type src_2_x2; + BufferVecTraits::load(&src_2[0], src_2_x2); BufferVectorType src_a[3], src_b[3]; src_a[0] = 
src_0_x2.val[0]; diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index ecafd7d38..940cf2707 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -173,6 +173,228 @@ class VecTraitsBase : public VectorTypes { // Maximum number of lanes in a vector. static constexpr size_t max_num_lanes() { return num_lanes(); } +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + + private: + static inline int8x16x2_t vld1q_x2(const int8_t *src) { + return vld1q_s8_x2(src); + } + + static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { + return vld1q_u8_x2(src); + } + + static inline int16x8x2_t vld1q_x2(const int16_t *src) { + return vld1q_s16_x2(src); + } + + static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { + return vld1q_u16_x2(src); + } + + static inline int32x4x2_t vld1q_x2(const int32_t *src) { + return vld1q_s32_x2(src); + } + + static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { + return vld1q_u32_x2(src); + } + + static inline int64x2x2_t vld1q_x2(const int64_t *src) { + return vld1q_s64_x2(src); + } + + static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { + return vld1q_u64_x2(src); + } + + static inline float32x4x2_t vld1q_x2(const float32_t *src) { + return vld1q_f32_x2(src); + } + + static inline int8x16x3_t vld1q_x3(const int8_t *src) { + return vld1q_s8_x3(src); + } + + static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { + return vld1q_u8_x3(src); + } + + static inline int16x8x3_t vld1q_x3(const int16_t *src) { + return vld1q_s16_x3(src); + } + + static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { + return vld1q_u16_x3(src); + } + + static inline int32x4x3_t vld1q_x3(const int32_t *src) { + return vld1q_s32_x3(src); + } + + static inline uint32x4x3_t vld1q_x3(const 
uint32_t *src) { + return vld1q_u32_x3(src); + } + + static inline int64x2x3_t vld1q_x3(const int64_t *src) { + return vld1q_s64_x3(src); + } + + static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { + return vld1q_u64_x3(src); + } + + static inline float32x4x3_t vld1q_x3(const float32_t *src) { + return vld1q_f32_x3(src); + } + + static inline int8x16x4_t vld1q_x4(const int8_t *src) { + return vld1q_s8_x4(src); + } + + static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { + return vld1q_u8_x4(src); + } + + static inline int16x8x4_t vld1q_x4(const int16_t *src) { + return vld1q_s16_x4(src); + } + + static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { + return vld1q_u16_x4(src); + } + + static inline int32x4x4_t vld1q_x4(const int32_t *src) { + return vld1q_s32_x4(src); + } + + static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { + return vld1q_u32_x4(src); + } + + static inline int64x2x4_t vld1q_x4(const int64_t *src) { + return vld1q_s64_x4(src); + } + + static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { + return vld1q_u64_x4(src); + } + + static inline float32x4x4_t vld1q_x4(const float32_t *src) { + return vld1q_f32_x4(src); + } + + static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { + vst1q_s8_x2(dst, vec); + } + + static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { + vst1q_u8_x2(dst, vec); + } + + static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { + vst1q_s16_x2(dst, vec); + } + + static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { + vst1q_u16_x2(dst, vec); + } + + static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { + vst1q_s32_x2(dst, vec); + } + + static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { + vst1q_u32_x2(dst, vec); + } + + static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { + vst1q_s64_x2(dst, vec); + } + + static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { + vst1q_u64_x2(dst, vec); + } + + static inline void vst1q_x2(float32_t *dst, 
float32x4x2_t vec) { + vst1q_f32_x2(dst, vec); + } + + static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) { + vst1q_s8_x3(dst, vec); + } + + static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) { + vst1q_u8_x3(dst, vec); + } + + static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) { + vst1q_s16_x3(dst, vec); + } + + static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) { + vst1q_u16_x3(dst, vec); + } + + static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) { + vst1q_s32_x3(dst, vec); + } + + static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) { + vst1q_u32_x3(dst, vec); + } + + static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) { + vst1q_s64_x3(dst, vec); + } + + static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) { + vst1q_u64_x3(dst, vec); + } + + static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) { + vst1q_f32_x3(dst, vec); + } + + static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { + vst1q_s8_x4(dst, vec); + } + + static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { + vst1q_u8_x4(dst, vec); + } + + static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { + vst1q_s16_x4(dst, vec); + } + + static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { + vst1q_u16_x4(dst, vec); + } + + static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { + vst1q_s32_x4(dst, vec); + } + + static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { + vst1q_u32_x4(dst, vec); + } + + static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { + vst1q_s64_x4(dst, vec); + } + + static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { + vst1q_u64_x4(dst, vec); + } + + static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { + vst1q_f32_x4(dst, vec); + } + + public: +#endif + // Loads a single vector from 'src'. 
static inline void load(const ScalarType *src, VectorType &vec) { vec = vld1q(&src[0]); @@ -180,17 +402,32 @@ class VecTraitsBase : public VectorTypes { // Loads two consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector2Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x2(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; +#endif } // Loads three consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector3Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x3(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes()))}; +#endif } // Loads four consecutive vectors from 'src'. static inline void load(const ScalarType *src, Vector4Type &vec) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec = vld1q_x4(&src[0]); +#else + vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes())), + vld1q(&src[0] + (3 * num_lanes()))}; +#endif } // Loads two consecutive vectors from 'src'. @@ -203,22 +440,47 @@ class VecTraitsBase : public VectorTypes { // Loads 2x2 consecutive vectors from 'src'. static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0, Vector2Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x2(&src[0]); vec_1 = vld1q_x2(&src[num_lanes() * 2]); +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; + vec_1 = {vld1q(&src[num_lanes() * 2]), + vld1q(&src[num_lanes() * 2] + num_lanes())}; +#endif } // Loads 2x3 consecutive vectors from 'src'. 
static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0, Vector3Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x3(&src[0]); vec_1 = vld1q_x3(&src[num_lanes() * 3]); +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes()))}; + vec_1 = {vld1q(&src[num_lanes() * 3]), + vld1q(&src[num_lanes() * 3] + num_lanes()), + vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))}; +#endif } // Loads 2x4 consecutive vectors from 'src'. static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0, Vector4Type &vec_1) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vec_0 = vld1q_x4(&src[0]); vec_1 = vld1q_x4(&src[num_lanes() * 4]); + +#else + vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), + vld1q(&src[0] + (2 * num_lanes())), + vld1q(&src[0] + (3 * num_lanes()))}; + vec_1 = {vld1q(&src[num_lanes() * 4]), + vld1q(&src[num_lanes() * 4] + num_lanes()), + vld1q(&src[num_lanes() * 4] + (2 * num_lanes())), + vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))}; +#endif } // Stores a single vector to 'dst'. @@ -228,7 +490,35 @@ class VecTraitsBase : public VectorTypes { // Stores two consecutive vectors to 'dst'. static inline void store(Vector2Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS vst1q_x2(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); +#endif + } + + // Stores three consecutive vectors to 'dst'. + static inline void store(Vector3Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + vst1q_x3(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); + vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); +#endif + } + + // Stores four consecutive vectors to 'dst'. 
+ static inline void store(Vector4Type vec, ScalarType *dst) { +#if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS + vst1q_x4(&dst[0], vec); +#else + vst1q(&dst[0], vec.val[0]); + vst1q(&dst[0] + num_lanes(), vec.val[1]); + vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); + vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]); +#endif } // Stores two consecutive vectors to 'dst'. diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h index 420a3733a..77a11c20d 100644 --- a/kleidicv/include/kleidicv/neon_intrinsics.h +++ b/kleidicv/include/kleidicv/neon_intrinsics.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -375,36 +375,6 @@ static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); } static inline float32x4x4_t vld4q(const float32_t *src) { return vld4q_f32(src); } -static inline int8x16x2_t vld1q_x2(const int8_t *src) { return vld1q_s8_x2(src); } -static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { return vld1q_u8_x2(src); } -static inline int16x8x2_t vld1q_x2(const int16_t *src) { return vld1q_s16_x2(src); } -static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { return vld1q_u16_x2(src); } -static inline int32x4x2_t vld1q_x2(const int32_t *src) { return vld1q_s32_x2(src); } -static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { return vld1q_u32_x2(src); } -static inline int64x2x2_t vld1q_x2(const int64_t *src) { return vld1q_s64_x2(src); } -static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { return vld1q_u64_x2(src); } -static inline float32x4x2_t vld1q_x2(const float32_t *src) { return vld1q_f32_x2(src); } - -static inline int8x16x3_t vld1q_x3(const int8_t *src) { return vld1q_s8_x3(src); } -static inline uint8x16x3_t vld1q_x3(const uint8_t *src) 
{ return vld1q_u8_x3(src); } -static inline int16x8x3_t vld1q_x3(const int16_t *src) { return vld1q_s16_x3(src); } -static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { return vld1q_u16_x3(src); } -static inline int32x4x3_t vld1q_x3(const int32_t *src) { return vld1q_s32_x3(src); } -static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { return vld1q_u32_x3(src); } -static inline int64x2x3_t vld1q_x3(const int64_t *src) { return vld1q_s64_x3(src); } -static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { return vld1q_u64_x3(src); } -static inline float32x4x3_t vld1q_x3(const float32_t *src) { return vld1q_f32_x3(src); } - -static inline int8x16x4_t vld1q_x4(const int8_t *src) { return vld1q_s8_x4(src); } -static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { return vld1q_u8_x4(src); } -static inline int16x8x4_t vld1q_x4(const int16_t *src) { return vld1q_s16_x4(src); } -static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { return vld1q_u16_x4(src); } -static inline int32x4x4_t vld1q_x4(const int32_t *src) { return vld1q_s32_x4(src); } -static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { return vld1q_u32_x4(src); } -static inline int64x2x4_t vld1q_x4(const int64_t *src) { return vld1q_s64_x4(src); } -static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { return vld1q_u64_x4(src); } -static inline float32x4x4_t vld1q_x4(const float32_t *src) { return vld1q_f32_x4(src); } - // ----------------------------------------------------------------------------- // NEON store operations // ----------------------------------------------------------------------------- @@ -458,26 +428,6 @@ static inline void vst4q(int64_t *dst, int64x2x4_t vec) { vst4q_s64(dst, vec static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); } static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); } -static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { vst1q_s8_x2(dst, vec); } -static inline void 
vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { vst1q_u8_x2(dst, vec); } -static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { vst1q_s16_x2(dst, vec); } -static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { vst1q_u16_x2(dst, vec); } -static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { vst1q_s32_x2(dst, vec); } -static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { vst1q_u32_x2(dst, vec); } -static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { vst1q_s64_x2(dst, vec); } -static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { vst1q_u64_x2(dst, vec); } -static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { vst1q_f32_x2(dst, vec); } - -static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { vst1q_s8_x4(dst, vec); } -static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { vst1q_u8_x4(dst, vec); } -static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { vst1q_s16_x4(dst, vec); } -static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { vst1q_u16_x4(dst, vec); } -static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { vst1q_s32_x4(dst, vec); } -static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { vst1q_u32_x4(dst, vec); } -static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { vst1q_s64_x4(dst, vec); } -static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { vst1q_u64_x4(dst, vec); } -static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { vst1q_f32_x4(dst, vec); } - // ----------------------------------------------------------------------------- // vreinterpret* // ----------------------------------------------------------------------------- diff --git a/kleidicv/src/arithmetics/in_range_neon.cpp b/kleidicv/src/arithmetics/in_range_neon.cpp index d0614d926..f03d9ddec 100644 --- a/kleidicv/src/arithmetics/in_range_neon.cpp +++ b/kleidicv/src/arithmetics/in_range_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates 
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -58,7 +58,9 @@ class InRange { Columns dst) { LoopUnroll{width, SrcVecTraits::num_lanes()} .unroll_n_times<4>([&](size_t step) { - SrcVector4Type src_vector = vld1q_f32_x4(&src[0]); + SrcVector4Type src_vector; + SrcVecTraits::load(&src[0], src_vector); + DstVectorType result_vector = vector_path(src_vector); vst1q(&dst[0], result_vector); src += ptrdiff_t(step); diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp index 7c46ec045..60621cca9 100644 --- a/kleidicv/src/arithmetics/scale_neon.cpp +++ b/kleidicv/src/arithmetics/scale_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -88,16 +88,18 @@ class ScaleUint8Tbx final : public ScaleIntBase { for (size_t i = 0; i < TableLength; ++i) { values[i] = this->scale_value(i); } - t0_3_ = vld1q_u8_x3(values); - t1_3_ = vld1q_u8_x3(values + 3 * VecTraits::num_lanes()); - t2_2_ = vld1q_u8_x2(values + (3 + 3) * VecTraits::num_lanes()); - t3_3_ = vld1q_u8_x3(values + (3 + 3 + 2) * VecTraits::num_lanes()); - t4_2_ = vld1q_u8_x2(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes()); - t5_3_ = vld1q_u8_x3(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes()); + + VecTraits::load(values, t0_3_); + VecTraits::load(values + 3 * VecTraits::num_lanes(), t1_3_); + VecTraits::load(values + (3 + 3) * VecTraits::num_lanes(), t2_2_); + VecTraits::load(values + (3 + 3 + 2) * VecTraits::num_lanes(), t3_3_); + VecTraits::load(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_); + VecTraits::load(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(), + t5_3_); + v_step3_ = vdupq_n_u8(3 * VecTraits::num_lanes()); v_step2_ = vdupq_n_u8(2 * VecTraits::num_lanes()); } - VectorType vector_path(VectorType src) { 
VectorType dst = vqtbl3q_u8(t0_3_, src); src = vsubq_u8(src, v_step3_); diff --git a/kleidicv/src/conversions/float_conv_neon.cpp b/kleidicv/src/conversions/float_conv_neon.cpp index e220dd914..1ef00c5eb 100644 --- a/kleidicv/src/conversions/float_conv_neon.cpp +++ b/kleidicv/src/conversions/float_conv_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -124,13 +124,14 @@ s8_to_f32(const int8_t* src, size_t src_stride, float* dst, size_t dst_stride, int32x4_t c = vreinterpretq_s32_s8(vqtbl1q_s8(input, index2)); int32x4_t d = vreinterpretq_s32_s8(vqtbl1q_s8(input, index3)); // Convert to float and divide by 2^24. + float32x4x4_t output = { vcvtq_n_f32_s32(a, 24), vcvtq_n_f32_s32(b, 24), vcvtq_n_f32_s32(c, 24), vcvtq_n_f32_s32(d, 24), }; - vst1q_f32_x4(dst + x, output); + neon::VecTraits::store(output, dst + x); } for (; x != width; ++x) { disable_loop_vectorization(); @@ -169,13 +170,14 @@ u8_to_f32(const uint8_t* src, size_t src_stride, float* dst, size_t dst_stride, uint32x4_t b = vreinterpretq_u32_u8(vqtbl1q_u8(input, index1)); uint32x4_t c = vreinterpretq_u32_u8(vqtbl1q_u8(input, index2)); uint32x4_t d = vreinterpretq_u32_u8(vqtbl1q_u8(input, index3)); + float32x4x4_t output = { vcvtq_f32_u32(a), vcvtq_f32_u32(b), vcvtq_f32_u32(c), vcvtq_f32_u32(d), }; - vst1q_f32_x4(dst + x, output); + neon::VecTraits::store(output, dst + x); } for (; x != width; ++x) { disable_loop_vectorization(); diff --git a/kleidicv/src/conversions/gray_to_rgb_neon.cpp b/kleidicv/src/conversions/gray_to_rgb_neon.cpp index 6b0d15256..6aceacc3a 100644 --- a/kleidicv/src/conversions/gray_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/gray_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // 
SPDX-License-Identifier: Apache-2.0 @@ -15,7 +15,9 @@ class GrayToRGB final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - GrayToRGB() : indices_{vld1q_u8_x3(kGrayToRGBTableIndices)} {} + GrayToRGB() : indices_{} { + VecTraits::load(kGrayToRGBTableIndices, indices_); + } #else GrayToRGB() = default; #endif @@ -31,7 +33,7 @@ class GrayToRGB final : public UnrollTwice { dst_vect.val[0] = vqtbl1q_u8(src_vect, indices_.val[0]); dst_vect.val[1] = vqtbl1q_u8(src_vect, indices_.val[1]); dst_vect.val[2] = vqtbl1q_u8(src_vect, indices_.val[2]); - vst1q_u8_x3(dst, dst_vect); + VecTraits::store(dst_vect, dst); #endif } @@ -41,11 +43,11 @@ class GrayToRGB final : public UnrollTwice { private: #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE + static constexpr uint8_t kGrayToRGBTableIndices[48] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15}; - uint8x16x3_t indices_; #endif }; // end of class GrayToRGB @@ -60,7 +62,8 @@ class GrayToRGBA final : public UnrollTwice { GrayToRGBA() : alpha_{vdupq_n_u8(0xff)} {} #else // NOLINTBEGIN(hicpp-member-init) - GrayToRGBA() : indices_{vld1q_u8_x4(kGrayToRGBATableIndices)} { + GrayToRGBA() : indices_{} { + VecTraits::load(kGrayToRGBATableIndices, indices_); src_and_alpha_.val[1] = vdupq_n_u8(0xff); } // NOLINTEND(hicpp-member-init) @@ -80,7 +83,8 @@ class GrayToRGBA final : public UnrollTwice { dst_vect.val[1] = vqtbl2q_u8(src_and_alpha_, indices_.val[1]); dst_vect.val[2] = vqtbl2q_u8(src_and_alpha_, indices_.val[2]); dst_vect.val[3] = vqtbl2q_u8(src_and_alpha_, indices_.val[3]); - vst1q_u8_x4(dst, dst_vect); + VecTraits::store(dst_vect, dst); + #endif } @@ -101,6 +105,7 @@ class GrayToRGBA final : public UnrollTwice { 4, 4, 4, 16, 5, 5, 5, 16, 6, 6, 6, 16, 7, 7, 7, 16, 8, 8, 8, 16, 9, 9, 9, 16, 10, 10, 10, 16, 11, 11, 11, 16, 12, 12, 12, 16, 13, 13, 13, 
16, 14, 14, 14, 16, 15, 15, 15, 16}; + #endif }; // end of class GrayToRGBA diff --git a/kleidicv/src/conversions/merge_neon.cpp b/kleidicv/src/conversions/merge_neon.cpp index 5b7bc0157..aaef9e8bf 100644 --- a/kleidicv/src/conversions/merge_neon.cpp +++ b/kleidicv/src/conversions/merge_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -41,11 +41,12 @@ class Merge2 final : public UnrollTwice { dst_vect.val[0] = src_a; dst_vect.val[1] = src_b; vst2q(&dst[0], dst_vect); -#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE +#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE Vector2Type dst_vect; dst_vect.val[0] = vzip1q(src_a, src_b); dst_vect.val[1] = vzip2q(src_a, src_b); - vst1q_x2(&dst[0], dst_vect); + VecTraits::store(dst_vect, &dst[0]); + #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -68,26 +69,30 @@ class Merge3 final : public UnrollTwice { using Vector3Type = typename VecTraits::Vector3Type; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - Merge3() : table_indices_{vld1q_u8_x3(lookup_table(ScalarType()))} {} + + Merge3() : table_indices_{} { + neon::VecTraits::load(lookup_table(ScalarType()), table_indices_); + } + #endif void vector_path(VectorType src_a, VectorType src_b, VectorType src_c, ScalarType *dst) { -#if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE Vector3Type dst_vect; +#if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE dst_vect.val[0] = src_a; dst_vect.val[1] = src_b; dst_vect.val[2] = src_c; vst3q(&dst[0], dst_vect); #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - uint8x16x3_t src_vect, dst_vect; + uint8x16x3_t src_vect; src_vect.val[0] = vreinterpretq_u8(src_a); src_vect.val[1] = vreinterpretq_u8(src_b); src_vect.val[2] = vreinterpretq_u8(src_c); dst_vect.val[0] = vqtbl3q_u8(src_vect, table_indices_.val[0]); dst_vect.val[1] = vqtbl3q_u8(src_vect, table_indices_.val[1]); dst_vect.val[2] = 
vqtbl3q_u8(src_vect, table_indices_.val[2]); - vst1q_u8_x3(reinterpret_cast(&dst[0]), dst_vect); + VecTraits::store(dst_vect, &dst[0]); #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -100,21 +105,20 @@ class Merge3 final : public UnrollTwice { private: #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - static const uint8_t *lookup_table(uint8_t) { + static uint8_t *lookup_table(uint8_t) { // clang-format off - static constexpr uint8_t kIndices[48] = { + static uint8_t kIndices[48] = { 0, 16, 32, 1, 17, 33, 2, 18, 34, 3, 19, 35, 4, 20, 36, 5, 21, 37, 6, 22, 38, 7, 23, 39, 8, 24, 40, 9, 25, 41, 10, 26, 42, 11, 27, 43, 12, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, }; - // clang-format on return &kIndices[0]; } // Lookup table for 16-bit inputs. - static const uint8_t *lookup_table(uint16_t) { + static uint8_t *lookup_table(uint16_t) { // clang-format off - static constexpr uint8_t kIndices[48] = { + static uint8_t kIndices[48] = { 0, 1, 16, 17, 32, 33, 2, 3, 18, 19, 34, 35, 4, 5, 20, 21, 36, 37, 6, 7, 22, 23, 38, 39, 8, 9, 24, 25, 40, 41, 10, 11, 26, 27, 42, 43, 12, 13, 28, 29, 44, 45, 14, 15, 30, 31, 46, 47, @@ -215,7 +219,8 @@ class Merge3 final : public UnrollTwice { dst_vect.val[1] = src_c; dst_vect.val[1][1] = src_a[1]; dst_vect.val[2] = vzip2q_u64(src_b, src_c); - vst1q_u64_x3(&dst[0], dst_vect); + + VecTraits::store(dst_vect, &dst[0]); } void scalar_path(const ScalarType *src_a, const ScalarType *src_b, @@ -273,7 +278,7 @@ class Merge4 final : public UnrollTwice { dst_vect.val[2] = src_c; dst_vect.val[3] = src_d; vst4q(&dst[0], dst_vect); -#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE +#else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE auto zip1_a_b = double_width(vzip1q(src_a, src_b)); auto zip1_c_d = double_width(vzip1q(src_c, src_d)); auto zip2_a_b = double_width(vzip2q(src_a, src_b)); @@ -287,7 +292,9 @@ class Merge4 final : public UnrollTwice { dst_vect.val[1] = vzip2q(zip1_a_b, zip1_c_d); dst_vect.val[2] = vzip1q(zip2_a_b, zip2_c_d); 
dst_vect.val[3] = vzip2q(zip2_a_b, zip2_c_d); - vst1q_x4(reinterpret_cast(&dst[0]), dst_vect); + neon::VecTraits::store( + dst_vect, reinterpret_cast(&dst[0])); + #endif // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE } @@ -346,7 +353,7 @@ class Merge4 final : public UnrollTwice { dst_vect.val[1] = vzip1q(src_c, src_d); dst_vect.val[2] = vzip2q(src_a, src_b); dst_vect.val[3] = vzip2q(src_c, src_d); - vst1q_x4(&dst[0], dst_vect); + VecTraits::store(dst_vect, &dst[0]); } void scalar_path(const ScalarType *src_a, const ScalarType *src_b, diff --git a/kleidicv/src/conversions/rgb_to_rgb_neon.cpp b/kleidicv/src/conversions/rgb_to_rgb_neon.cpp index 001fe94a7..c2cba62c5 100644 --- a/kleidicv/src/conversions/rgb_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/rgb_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -14,7 +14,7 @@ class RGBToBGR final : public UnrollTwice { using VecTraits = neon::VecTraits; #if !KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - RGBToBGR() : indices_{vld1q_u8_x3(kRGBToBGRTableIndices)} {} + RGBToBGR() : indices_{} { VecTraits::load(kRGBToBGRTableIndices, indices_); } #else RGBToBGR() = default; #endif @@ -30,7 +30,10 @@ class RGBToBGR final : public UnrollTwice { vst3q_u8(dst, dst_vect); #else - uint8x16x3_t src_vect = vld1q_u8_x3(src); + + uint8x16x3_t src_vect; + VecTraits::load(src, src_vect); + uint8x16x3_t dst_vect; uint8x16x2_t src_vect_0_1; @@ -45,7 +48,7 @@ class RGBToBGR final : public UnrollTwice { dst_vect.val[1] = vqtbl3q_u8(src_vect, indices_.val[1]); dst_vect.val[2] = vqtbl2q_u8(src_vect_1_2, indices_.val[2]); - vst1q_u8_x3(dst, dst_vect); + VecTraits::store(dst_vect, dst); #endif } @@ -62,7 +65,6 @@ class RGBToBGR final : public UnrollTwice { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21, 26, 25, 24, 29, 28, 27, 32, 31, 14, 19, 18, 
17, 22, 21, 20, 25, 24, 23, 28, 27, 26, 31, 30, 29}; - uint8x16x3_t indices_; #endif }; // end of class RGBToBGR diff --git a/kleidicv/src/conversions/rgb_to_yuv_neon.cpp b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp index c8b89502a..55a8b8f0d 100644 --- a/kleidicv/src/conversions/rgb_to_yuv_neon.cpp +++ b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -28,7 +28,8 @@ class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop { RawSourceVectorType vsrc; int16x8_t r_l, r_h, g_l, g_h, b_l, b_h; if constexpr (ALPHA) { - vsrc = vld1q_u8_x4(src); + VecTraits::load(src, vsrc); + uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]); uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]); if constexpr (BGR) { diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp index 590894cde..1deacceae 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -28,8 +28,11 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { 128 * (kUVWeights[1] + kUVWeights[2]))}, b_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, - de_interleave_indices_{vld1q_s8_x4(kDeInterleaveTableIndices)}, - is_nv21_(is_nv21) {} + de_interleave_indices_{}, + is_nv21_(is_nv21) { + neon::VecTraits::load(kDeInterleaveTableIndices, + de_interleave_indices_); + } // Returns the number of channels in the output image. 
static constexpr size_t output_channels() { @@ -266,8 +269,8 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { int8x16x4_t de_interleave_indices_; const bool is_nv21_; - // clang-format off + static constexpr int8_t kDeInterleaveTableIndices[64] = { /* low and even */ 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, @@ -278,6 +281,7 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { /* high and odd */ 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, }; + // clang-format on }; // end of class YUVSpToRGBxOrBGRx diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index af3b5baef..b28de5876 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -37,7 +37,7 @@ class GaussianBlur { using BufferVectorType = typename VecTraits::VectorType; using DestinationType = ScalarType; - explicit GaussianBlur([[maybe_unused]] float sigma) {} + explicit GaussianBlur([[maybe_unused]] float sigma) // Applies vertical filtering vector using SIMD operations. // @@ -179,7 +179,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) +explicit GaussianBlur([[maybe_unused]] float sigma) : const_7_u16_{vdupq_n_u16(7)}, const_7_u32_{vdupq_n_u32(7)}, const_9_u16_{vdupq_n_u16(9)} {} @@ -434,8 +434,7 @@ class GaussianBlur { vmlal_u16(acc.val[2], vget_low_u16(acc_7_h), const_158_u16_half_); acc.val[3] = vmlal_u16(acc.val[3], vget_high_u16(acc_7_h), const_158_u16_half_); - - vst1q_u32_x4(&dst[0], acc); + neon::VecTraits::store(acc, &dst[0]); } // Applies vertical filtering vector using scalar operations. 
@@ -565,8 +564,7 @@ class GaussianBlur { } uint32x4x4_t result = {acc_l_l, acc_l_h, acc_h_l, acc_h_h}; - - vst1q_u32_x4(&dst[0], result); + neon::VecTraits::store(result, &dst[0]); } void vertical_scalar_path(const SourceType src[KernelSize], diff --git a/kleidicv/src/resize/resize_linear_neon.cpp b/kleidicv/src/resize/resize_linear_neon.cpp index 7b6d1d54d..e56bfde67 100644 --- a/kleidicv/src/resize/resize_linear_neon.cpp +++ b/kleidicv/src/resize/resize_linear_neon.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -6,6 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/operations.h" #include "kleidicv/resize/resize_linear.h" namespace kleidicv::neon { @@ -887,38 +888,43 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); dst_0.val[1] = lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); - vst1q_x2(dst_row0, dst_0); + neon::VecTraits::store(dst_0, dst_row0); float32x4x2_t dst_7; dst_7.val[0] = lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); dst_7.val[1] = lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); - vst1q_x2(dst_row7, dst_7); + neon::VecTraits::store(dst_7, dst_row7); float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); float32x4x2_t dst; dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); - vst1q_x2(dst_row1, dst); + + neon::VecTraits::store(dst, dst_row1); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); - vst1q_x2(dst_row2, dst); + + neon::VecTraits::store(dst, dst_row2); dst.val[0] = 
lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); - vst1q_x2(dst_row3, dst); + + neon::VecTraits::store(dst, dst_row3); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); - vst1q_x2(dst_row4, dst); + + neon::VecTraits::store(dst, dst_row4); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); - vst1q_x2(dst_row5, dst); + + neon::VecTraits::store(dst, dst_row5); dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); - vst1q_x2(dst_row6, dst); + neon::VecTraits::store(dst, dst_row6); dst_row0 += 8; dst_row1 += 8; dst_row2 += 8; diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 40d8fa85a..102d7d3c9 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -602,7 +602,8 @@ inline uint8x16_t load_4px_4ch(Rows src_rows, } inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns dst) { - vst1q_u8_x2(&dst[0], res); + using ScalarType = uint8_t; + neon::VecTraits::store(res, &dst[0]); } inline uint16x8_t load_2px_4ch(Rows src_rows, @@ -612,7 +613,8 @@ inline uint16x8_t load_2px_4ch(Rows src_rows, } inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns dst) { - vst1q_u16_x4(&dst[0], res); + using ScalarType = uint16_t; + neon::VecTraits::store(res, &dst[0]); } // Replicate border specific functions -- GitLab From ab857d42e21e3774df96b3a562813dca40795072 Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Thu, 27 Mar 2025 11:14:14 +0000 Subject: [PATCH 2/4] Remove cfloat/climits headers from resize test api --- test/api/test_resize_linear.cpp | 62 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git 
a/test/api/test_resize_linear.cpp b/test/api/test_resize_linear.cpp index 576ec8684..409741c28 100644 --- a/test/api/test_resize_linear.cpp +++ b/test/api/test_resize_linear.cpp @@ -1,11 +1,10 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 #include #include -#include #include #include #include @@ -623,11 +622,14 @@ INSTANTIATE_TEST_SUITE_P( Pf32{{{0, 255}}, {{0, 63.75F, 191.25F, 255}, {0, 63.75F, 191.25F, 255}}}, // 2*2 -> 4*4 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, - {{FLT_MAX, 2.8021173e38F, 1.6007058e38F, 1e38F}, + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), 2.8021173e38F, 1.6007058e38F, + 1e38F}, {2.5521173e38F, 2.101588e38F, 1.2005294e38F, 7.5e37F}, {8.5070577e37F, 7.0052933e37F, 4.0017645e37F, 2.5e37F}, - {0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN}}}, + {0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, // 3*3 -> 6*6 Pf32{{{1, 63, 164}, {28, 251, 35}, {218, 64, 99}}, {{1, 16.5F, 47.5F, 88.25F, 138.75F, 164}, @@ -655,12 +657,13 @@ INSTANTIATE_TEST_SUITE_P( 37.5F}, {200, 175, 125, 112.5F, 137.5F, 125, 75, 50}}}, // 35*2 -> 70*4 + // clang-format off Pf32{{{0, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, FLT_MAX, FLT_MAX, 104, 108, 227, 46, 162, + 9, 10, std::numeric_limits::max(), std::numeric_limits::max(), 104, 108, 227, 46, 162, 21, 220, 235, 183, 113, 225, 146, 196, 144, 104, 148, 19, 126, 172, 9, 12, 61}, - {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, FLT_MAX, - FLT_MAX, 105, 191, 106, 73, 148, 13, 161, 118, 21, 3, 34, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, std::numeric_limits::max(), + std::numeric_limits::max(), 105, 191, 106, 73, 148, 13, 161, 118, 21, 3, 34, 40, 150, 120, 68, 75, 14, 31, 124, 221, 214, 146}}, {{0.0F, 0.25F, 0.75F, 1.25F, 1.75F, 2.25F, 2.75F, 3.25F, @@ -735,10 +738,10 @@ INSTANTIATE_TEST_SUITE_P( 196.75F, 219.25F, 
215.75F, 197.0F, 163.0F, 146.0F}}}, // 2*2 -> 8*8 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, - {{FLT_MAX, FLT_MAX, 3.10247e38F, 2.5017644e38F, 1.9010587e38F, + Pf32{{{std::numeric_limits::max(), 1e38F}, {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), std::numeric_limits::max(), 3.10247e38F, 2.5017644e38F, 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, - {FLT_MAX, FLT_MAX, 3.1024702e38F, 2.5017644e38F, 1.9010587e38F, + {std::numeric_limits::max(), std::numeric_limits::max(), 3.1024702e38F, 2.5017644e38F, 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, {2.9774701e38F, 2.9774701e38F, 2.7146614e38F, 2.189044e38F, 1.6634263e38F, 1.1378087e38F, 8.75e37F, 8.75e37F}, @@ -748,10 +751,36 @@ INSTANTIATE_TEST_SUITE_P( 7.1289698e37F, 4.8763233e37F, 3.75e37F, 3.75e37F}, {4.2535288e37F, 4.2535288e37F, 3.8780877e37F, 3.1272055e37F, 2.3763234e37F, 1.6254411e37F, 1.25e37F, 1.25e37F}, - {0, 0, 0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN, FLT_TRUE_MIN, - FLT_TRUE_MIN}, - {0, 0, 0, 0, FLT_TRUE_MIN, FLT_TRUE_MIN, FLT_TRUE_MIN, - FLT_TRUE_MIN}}}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, + // 35*2 -> 140*8 + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, + {{std::numeric_limits::max(), + std::numeric_limits::max(), 3.10247e38F, 2.5017644e38F, + 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, + {std::numeric_limits::max(), + std::numeric_limits::max(), 3.1024702e38F, 2.5017644e38F, + 1.9010587e38F, 1.3003528e38F, 1e38F, 1e38F}, + {2.9774701e38F, 2.9774701e38F, 2.7146614e38F, 2.189044e38F, + 1.6634263e38F, 1.1378087e38F, 8.75e37F, 8.75e37F}, + {2.1267644e38F, 2.1267644e38F, 1.9390438e38F, 1.5636028e38F, + 1.1881617e38F, 8.1272051e37F, 6.25e37F, 6.25e37F}, + 
{1.2760587e38F, 1.2760587e38F, 1.1634263e38F, 9.3816164e37F, + 7.1289698e37F, 4.8763233e37F, 3.75e37F, 3.75e37F}, + {4.2535288e37F, 4.2535288e37F, 3.8780877e37F, 3.1272055e37F, + 2.3763234e37F, 1.6254411e37F, 1.25e37F, 1.25e37F}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}, + {0, 0, 0, 0, std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::denorm_min()}}}, + // clang-format on // 35*2 -> 140*8 Pf32{{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 82, 155, 104, 108, 227, 46, 162, 21, 220, 235, 183, 113, 225, @@ -968,7 +997,8 @@ INSTANTIATE_TEST_SUITE_P( 216.625F, 214.875F, 205.5F, 188.5F, 171.5F, 154.5F, 146, 146}}}, // 2*2 -> 16*16 - Pf32{{{FLT_MAX, 1e38F}, {0, FLT_TRUE_MIN}}, + Pf32{{{std::numeric_limits::max(), 1e38F}, + {0, std::numeric_limits::denorm_min()}}, { {3.402823466e+38F, 3.402823466e+38F, 3.402823466e+38F, 3.402823466e+38F, 3.252647029e+38F, 2.952294156e+38F, -- GitLab From b7a40ef97d0a6f1b08b912714a630ae7d30972af Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Fri, 28 Feb 2025 12:03:34 +0000 Subject: [PATCH 3/4] Refactor maybe_unused statements in neon gaussian blur Prior formatting for maybe_unused statements is not supported on GCC8, code is refactored to increase compiler compatibility. 
--- kleidicv/src/filters/gaussian_blur_neon.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index b28de5876..5503b1eb8 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -37,7 +37,7 @@ class GaussianBlur { using BufferVectorType = typename VecTraits::VectorType; using DestinationType = ScalarType; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) {} // Applies vertical filtering vector using SIMD operations. // @@ -98,7 +98,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_6_u8_half_{vdup_n_u8(6)}, const_6_u16_{vdupq_n_u16(6)}, const_4_u16_{vdupq_n_u16(4)} {} @@ -179,7 +179,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; -explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_7_u16_{vdupq_n_u16(7)}, const_7_u32_{vdupq_n_u32(7)}, const_9_u16_{vdupq_n_u16(9)} {} @@ -327,7 +327,7 @@ class GaussianBlur { using BufferType = uint32_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) + explicit GaussianBlur(float sigma [[maybe_unused]]) : const_11_u16_{vdupq_n_u16(11)}, const_11_u32_{vdupq_n_u32(11)}, const_25_u16_{vdupq_n_u16(25)}, -- GitLab From 74975482444561e654d5cd6ac13b7c468264201a Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Mon, 24 Mar 2025 12:59:27 +0000 Subject: [PATCH 4/4] Extend CI script to test MultiVec flag --- scripts/ci.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/ci.sh b/scripts/ci.sh index f2d097b8c..a48b29aa7 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -95,7 +95,7 @@ if [[ $(dpkg 
--print-architecture) = arm64 ]]; then build/ci/sanitize/test/api/kleidicv-api-test fi -# Build benchmarks, just to prevent bitrot. +# Build benchmarks, and build without the continuous load/store code path, just to prevent bitrot. cmake -S . -B build/ci/build-benchmark -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_COMPILE_WARNING_AS_ERROR=ON \ @@ -107,7 +107,8 @@ cmake -S . -B build/ci/build-benchmark -G Ninja \ -DKLEIDICV_BENCHMARK=ON \ -DKLEIDICV_ENABLE_SME2=ON \ -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \ - -DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF + -DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF \ + -DKLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS=OFF ninja -C build/ci/build-benchmark kleidicv-benchmark # TODO: Cross-build OpenCV -- GitLab