From 55de8847e68219a64c6d439567bf81de38ef8f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Mon, 5 Aug 2024 12:27:33 +0200 Subject: [PATCH 1/2] Adjust Separable Filter 2D for performance The following optimizations were carried out: - Using kernel vectors instead of immediate values in vector intrinsics - Using combined widen/multiply and widen/multiply-accumulate intrinsics (now possible due to the previous change) - Prefer "high" versions of these intrinsics (for NEON) - Using a bigger intermediate type (uint16_t), thus avoiding extra narrowing inbetween vertical and horizontal code paths --- .../src/filters/separable_filter_2d_neon.cpp | 91 ++++++---- kleidicv/src/filters/separable_filter_2d_sc.h | 167 +++++++++++------- test/api/test_separable_filter_2d.cpp | 14 +- 3 files changed, 167 insertions(+), 105 deletions(-) diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 8d3d0d3ed..0c424fca1 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -19,65 +19,76 @@ template <> class SeparableFilter2D { public: using SourceType = uint8_t; - using BufferType = uint8_t; + using SourceVectorType = typename VecTraits::VectorType; + using BufferType = uint16_t; + using BufferVectorType = typename VecTraits::VectorType; using DestinationType = uint8_t; + using DestinationVectorType = typename VecTraits::VectorType; + + // NOLINTNEXTLINE - hicpp-member-init + SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y) + : kernel_x_(kernel_x), kernel_y_(kernel_y) { + for (size_t i = 0; i < 5; i++) { + kernel_x_u16_[i] = vdupq_n_u16(kernel_x[i]); + kernel_y_u8_[i] = vdupq_n_u8(kernel_y[i]); + } + } - explicit SeparableFilter2D(const uint8_t *kernel_x, const uint8_t *kernel_y) - : kernel_x_(kernel_x), kernel_y_(kernel_y) {} + void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const { + SourceVectorType acc_l = + vmull_u8(vget_low_u8(src[0]), vget_low_u8(kernel_y_u8_[0])); + SourceVectorType acc_h = vmull_high_u8(src[0], kernel_y_u8_[0]); + + // Optimization to avoid unnecessary branching in vector code. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 1; i < 5; i++) { + acc_l = + vmlal_u8(acc_l, vget_low_u8(src[i]), vget_low_u8(kernel_y_u8_[i])); + acc_h = vmlal_high_u8(acc_h, src[i], kernel_y_u8_[i]); + } - void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const { - this->vector_path_with_kernel(src, dst, kernel_y_); + vst1q_u16(&dst[0], acc_l); + vst1q_u16(&dst[8], acc_h); } void vertical_scalar_path(const SourceType src[5], BufferType *dst) const { - this->scalar_path_with_kernel(src, dst, kernel_y_); - } + BufferType acc = static_cast(src[0]) * kernel_y_[0]; + for (size_t i = 1; i < 5; i++) { + BufferType temp = static_cast(src[i]) * kernel_y_[i]; + if (__builtin_add_overflow(acc, temp, &acc)) { + dst[0] = std::numeric_limits::max(); + return; + } + } - void horizontal_vector_path(uint8x16_t src[5], DestinationType *dst) const { - this->vector_path_with_kernel(src, dst, kernel_x_); + dst[0] = acc; } - void horizontal_scalar_path(const BufferType src[5], + void horizontal_vector_path(BufferVectorType src[5], DestinationType *dst) const { - this->scalar_path_with_kernel(src, dst, kernel_x_); - } - - private: - void vector_path_with_kernel(uint8x16_t src[5], uint8_t *dst, - const uint8_t *kernel) const { - uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); - uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); - - acc_l = vmulq_n_u16(acc_l, kernel[0]); - acc_h = vmulq_n_u16(acc_h, kernel[0]); + BufferVectorType acc = vmulq_u16(src[0], kernel_x_u16_[0]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < 5; i++) { - uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); - uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); - - acc_l = vmlaq_n_u16(acc_l, vec_l, kernel[i]); - acc_h = vmlaq_n_u16(acc_h, vec_h, kernel[i]); + acc = vmlaq_u16(acc, src[i], kernel_x_u16_[i]); } - uint8x8_t result_l = vqmovn_u16(acc_l); - uint8x16_t result = vqmovn_high_u16(result_l, acc_h); - - vst1q_u8(&dst[0], result); + uint8x8_t result = vqmovn_u16(acc); + vst1_u8(&dst[0], result); } - void scalar_path_with_kernel(const uint8_t src[5], uint8_t *dst, - const uint8_t *kernel) const { - uint8_t acc; // NOLINT - if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { + void horizontal_scalar_path(const BufferType src[5], + DestinationType *dst) const { + SourceType acc; // NOLINT + if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { dst[0] = std::numeric_limits::max(); return; } for (size_t i = 1; i < 5; i++) { - uint8_t temp; // NOLINT - if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { + SourceType temp; // NOLINT + if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { dst[0] = std::numeric_limits::max(); return; } @@ -90,8 +101,12 @@ class SeparableFilter2D { dst[0] = acc; } - const uint8_t *kernel_x_; - const uint8_t *kernel_y_; + private: + const SourceType *kernel_x_; + const SourceType *kernel_y_; + + BufferVectorType kernel_x_u16_[5]; + SourceVectorType kernel_y_u8_[5]; }; KLEIDICV_TARGET_FN_ATTRS diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 9ba9c9fb8..1df5a7aed 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -20,38 +20,95 @@ template <> class SeparableFilter2D { public: using SourceType = uint8_t; - using BufferType = uint8_t; + using SourceVectorType = typename VecTraits::VectorType; + using BufferType = uint16_t; + using BufferVectorType = typename VecTraits::VectorType; + using BufferDoubleVectorType = typename VecTraits::Vector2Type; using DestinationType = uint8_t; - explicit SeparableFilter2D(const uint8_t *kernel_x, const uint8_t *kernel_y) - : kernel_x_(kernel_x), kernel_y_(kernel_y) {} + SeparableFilter2D( + const SourceType *kernel_x, BufferVectorType &kernel_x_0_u16, + BufferVectorType &kernel_x_1_u16, BufferVectorType &kernel_x_2_u16, + BufferVectorType &kernel_x_3_u16, BufferVectorType &kernel_x_4_u16, + SourceVectorType &kernel_y_0_u8, SourceVectorType &kernel_y_1_u8, + SourceVectorType &kernel_y_2_u8, SourceVectorType &kernel_y_3_u8, + SourceVectorType &kernel_y_4_u8) + : kernel_x_(kernel_x), + kernel_x_0_u16_(kernel_x_0_u16), + kernel_x_1_u16_(kernel_x_1_u16), + kernel_x_2_u16_(kernel_x_2_u16), + kernel_x_3_u16_(kernel_x_3_u16), + kernel_x_4_u16_(kernel_x_4_u16), + + kernel_y_0_u8_(kernel_y_0_u8), + kernel_y_1_u8_(kernel_y_1_u8), + kernel_y_2_u8_(kernel_y_2_u8), + kernel_y_3_u8_(kernel_y_3_u8), + kernel_y_4_u8_(kernel_y_4_u8) {} + + void vertical_vector_path( + svbool_t pg, SourceVectorType src_0, SourceVectorType src_1, + SourceVectorType src_2, SourceVectorType src_3, SourceVectorType src_4, + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + // 0 + BufferVectorType acc_b = svmullb_u16(src_0, kernel_y_0_u8_); + BufferVectorType acc_t = svmullt_u16(src_0, kernel_y_0_u8_); + + // 1 + acc_b = svmlalb_u16(acc_b, src_1, kernel_y_1_u8_); + acc_t = svmlalt_u16(acc_t, src_1, kernel_y_1_u8_); + + // 2 + acc_b = svmlalb_u16(acc_b, src_2, kernel_y_2_u8_); + acc_t = svmlalt_u16(acc_t, src_2, kernel_y_2_u8_); + + // 3 + acc_b = svmlalb_u16(acc_b, src_3, kernel_y_3_u8_); + acc_t = svmlalt_u16(acc_t, src_3, kernel_y_3_u8_); + + // 4 + acc_b = svmlalb_u16(acc_b, src_4, kernel_y_4_u8_); + acc_t = svmlalt_u16(acc_t, src_4, kernel_y_4_u8_); - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, - BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - this->vector_path_with_kernel(pg, src_0, src_1, src_2, src_3, src_4, dst, - kernel_y_); + BufferDoubleVectorType interleaved = svcreate2_u16(acc_b, acc_t); + svst2(pg, &dst[0], interleaved); } - void horizontal_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, - DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - this->vector_path_with_kernel(pg, src_0, src_1, src_2, src_3, src_4, dst, - kernel_x_); + void horizontal_vector_path( + svbool_t pg, BufferVectorType src_0, BufferVectorType src_1, + BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + // 0 + BufferVectorType acc = svmul_u16_x(pg, src_0, kernel_x_0_u16_); + + // 1 + acc = svmla_u16_x(pg, acc, src_1, kernel_x_1_u16_); + + // 2 + acc = svmla_u16_x(pg, acc, src_2, kernel_x_2_u16_); + + // 3 + acc = svmla_u16_x(pg, acc, src_3, kernel_x_3_u16_); + + // 4 + acc = svmla_u16_x(pg, acc, src_4, kernel_x_4_u16_); + + svbool_t greater = + svcmpgt_n_u16(pg, acc, std::numeric_limits::max()); + acc = svdup_n_u16_m(acc, greater, std::numeric_limits::max()); + svst1b_u16(pg, &dst[0], acc); } void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - uint8_t acc; // NOLINT + SourceType acc; // NOLINT if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { dst[0] = std::numeric_limits::max(); return; } for (size_t i = 1; i < 5; i++) { - uint8_t temp; // NOLINT + SourceType temp; // NOLINT if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { dst[0] = std::numeric_limits::max(); return; @@ -66,52 +123,19 @@ class SeparableFilter2D { } private: - void vector_path_with_kernel( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, BufferType *dst, - const uint8_t *kernel) const KLEIDICV_STREAMING_COMPATIBLE { - // 0 - svuint16_t acc_b = svmovlb_u16(src_0); - svuint16_t acc_t = svmovlt_u16(src_0); - - acc_b = svmul_n_u16_x(pg, acc_b, kernel[0]); - acc_t = svmul_n_u16_x(pg, acc_t, kernel[0]); - - // 1 - svuint16_t vec_1_b = svmovlb_u16(src_1); - svuint16_t vec_1_t = svmovlt_u16(src_1); - - acc_b = svmla_n_u16_x(pg, acc_b, vec_1_b, kernel[1]); - acc_t = svmla_n_u16_x(pg, acc_t, vec_1_t, kernel[1]); - - // 2 - svuint16_t vec_2_b = svmovlb_u16(src_2); - svuint16_t vec_2_t = svmovlt_u16(src_2); - - acc_b = svmla_n_u16_x(pg, acc_b, vec_2_b, kernel[2]); - acc_t = svmla_n_u16_x(pg, acc_t, vec_2_t, kernel[2]); - - // 3 - svuint16_t vec_3_b = svmovlb_u16(src_3); - svuint16_t vec_3_t = svmovlt_u16(src_3); - - acc_b = svmla_n_u16_x(pg, acc_b, vec_3_b, kernel[3]); - acc_t = svmla_n_u16_x(pg, acc_t, vec_3_t, kernel[3]); - - // 4 - svuint16_t vec_4_b = svmovlb_u16(src_4); - svuint16_t vec_4_t = svmovlt_u16(src_4); - - acc_b = svmla_n_u16_x(pg, acc_b, vec_4_b, kernel[4]); - acc_t = svmla_n_u16_x(pg, acc_t, vec_4_t, kernel[4]); - - svuint8_t result_b = svqxtnb_u16(acc_b); - svuint8_t result = svqxtnt_u16(result_b, acc_t); - svst1_u8(pg, &dst[0], result); - } - - const uint8_t *kernel_x_; - const uint8_t *kernel_y_; + const SourceType *kernel_x_; + + BufferVectorType &kernel_x_0_u16_; + BufferVectorType &kernel_x_1_u16_; + BufferVectorType &kernel_x_2_u16_; + BufferVectorType &kernel_x_3_u16_; + BufferVectorType &kernel_x_4_u16_; + + SourceVectorType &kernel_y_0_u8_; + SourceVectorType &kernel_y_1_u8_; + SourceVectorType &kernel_y_2_u8_; + SourceVectorType &kernel_y_3_u8_; + SourceVectorType &kernel_y_4_u8_; }; // end of class SeparableFilter2D static kleidicv_error_t separable_filter_2d_u8_sc( @@ -158,7 +182,22 @@ static kleidicv_error_t separable_filter_2d_u8_sc( using SeparableFilterClass = SeparableFilter2D; - SeparableFilterClass filterClass{kernel_x, kernel_y}; + svuint16_t kernel_x_0_u16 = svdup_n_u16(kernel_x[0]); + svuint16_t kernel_x_1_u16 = svdup_n_u16(kernel_x[1]); + svuint16_t kernel_x_2_u16 = svdup_n_u16(kernel_x[2]); + svuint16_t kernel_x_3_u16 = svdup_n_u16(kernel_x[3]); + svuint16_t kernel_x_4_u16 = svdup_n_u16(kernel_x[4]); + + svuint8_t kernel_y_0_u8 = svdup_n_u8(kernel_y[0]); + svuint8_t kernel_y_1_u8 = svdup_n_u8(kernel_y[1]); + svuint8_t kernel_y_2_u8 = svdup_n_u8(kernel_y[2]); + svuint8_t kernel_y_3_u8 = svdup_n_u8(kernel_y[3]); + svuint8_t kernel_y_4_u8 = svdup_n_u8(kernel_y[4]); + + SeparableFilterClass filterClass{ + kernel_x, kernel_x_0_u16, kernel_x_1_u16, kernel_x_2_u16, + kernel_x_3_u16, kernel_x_4_u16, kernel_y_0_u8, kernel_y_1_u8, + kernel_y_2_u8, kernel_y_3_u8, kernel_y_4_u8}; SeparableFilter filter{filterClass}; Rows src_rows{src, src_stride, channels}; diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp index 4da5b6b46..10a6ca47d 100644 --- a/test/api/test_separable_filter_2d.cpp +++ b/test/api/test_separable_filter_2d.cpp @@ -162,7 +162,7 @@ TYPED_TEST(SeparableFilter2D, 5x5Overflow) { // clang-format on test::Array2D kernel_x{5, 1}; - kernel_x.set(0, 0, {9, 9, 9, 9, 9}); + kernel_x.set(0, 0, {1, 2, 3, 4, 5}); test::Array2D kernel_y{5, 1}; kernel_y.set(0, 0, {5, 6, 7, 8, 9}); @@ -172,6 +172,14 @@ TYPED_TEST(SeparableFilter2D, 5x5Overflow) { 5, 5, 1, kernel_x.data(), 5, kernel_y.data(), 5, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + // clang-format off + src.set(0, 0, { 255, 255, 255, 255, 255}); + src.set(1, 0, { 255, 255, 255, 255, 255}); + src.set(2, 0, { 255, 255, 255, 255, 255}); + src.set(3, 0, { 255, 255, 255, 255, 255}); + src.set(4, 0, { 255, 255, 255, 255, 255}); + // clang-format on + test::Array2D dst_expected{5, 5, test::Options::vector_length()}; // clang-format off dst_expected.set(0, 0, { 255, 255, 255, 255, 255}); @@ -182,8 +190,8 @@ TYPED_TEST(SeparableFilter2D, 5x5Overflow) { // clang-format on EXPECT_EQ_ARRAY2D(dst_expected, dst); - kernel_x.set(0, 0, {0, 1, 2, 3, 4}); - kernel_y.set(0, 0, {9, 9, 9, 9, 9}); + kernel_x.set(0, 0, {255, 255, 255, 255, 255}); + kernel_y.set(0, 0, {255, 255, 255, 255, 255}); EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( src.data(), src.stride(), dst.data(), dst.stride(), -- GitLab From 0ea0990845201b710a1d91d3bd3712edb34a7a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 9 Aug 2024 15:31:06 +0200 Subject: [PATCH 2/2] Split Separable Filter 2D checks into a function This can allow the compiler to become certain that the main operation function should be inlined. --- .../src/filters/separable_filter_2d_neon.cpp | 38 ++++++++++++++----- kleidicv/src/filters/separable_filter_2d_sc.h | 38 ++++++++++++++----- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 0c424fca1..eefcd58b3 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -109,15 +109,13 @@ class SeparableFilter2D { SourceVectorType kernel_y_u8_[5]; }; -KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t separable_filter_2d_u8( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height, size_t channels, const uint8_t *kernel_x, - size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, - kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { - CHECK_POINTERS(context, kernel_x, kernel_y); - auto *workspace = reinterpret_cast(context); - auto fixed_border_type = get_fixed_border_type(border_type); +template +static kleidicv_error_t separable_filter_2d_checks( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, const T *kernel_x, size_t kernel_width, + const T *kernel_y, size_t kernel_height, + SeparableFilterWorkspace *workspace) { + CHECK_POINTERS(workspace, kernel_x, kernel_y); if (kernel_width != 5 || kernel_height != 5) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; @@ -139,18 +137,38 @@ kleidicv_error_t separable_filter_2d_u8( return KLEIDICV_ERROR_CONTEXT_MISMATCH; } - Rectangle rect{width, height}; const Rectangle &context_rect = workspace->image_size(); if (context_rect.width() < width || context_rect.height() < height) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t separable_filter_2d_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint8_t *kernel_x, + size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = separable_filter_2d_checks( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, workspace); + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + auto fixed_border_type = get_fixed_border_type(border_type); // if the std::optional is empty, that means that the border type is not // supported, so there's no need to check for specific types if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } + Rectangle rect{width, height}; + using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 1df5a7aed..1cf91c2ba 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -138,15 +138,13 @@ class SeparableFilter2D { SourceVectorType &kernel_y_4_u8_; }; // end of class SeparableFilter2D -static kleidicv_error_t separable_filter_2d_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height, size_t channels, const uint8_t *kernel_x, - size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, - kleidicv_border_type_t border_type, - kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { - CHECK_POINTERS(context, kernel_x, kernel_y); - auto *workspace = reinterpret_cast(context); - auto fixed_border_type = get_fixed_border_type(border_type); +template +static kleidicv_error_t separable_filter_2d_checks( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, const T *kernel_x, size_t kernel_width, + const T *kernel_y, size_t kernel_height, + SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + CHECK_POINTERS(workspace, kernel_x, kernel_y); if (kernel_width != 5 || kernel_height != 5) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; @@ -168,18 +166,38 @@ static kleidicv_error_t separable_filter_2d_u8_sc( return KLEIDICV_ERROR_CONTEXT_MISMATCH; } - Rectangle rect{width, height}; const Rectangle &context_rect = workspace->image_size(); if (context_rect.width() < width || context_rect.height() < height) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } + return KLEIDICV_OK; +} + +static kleidicv_error_t separable_filter_2d_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint8_t *kernel_x, + size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = separable_filter_2d_checks( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, workspace); + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + auto fixed_border_type = get_fixed_border_type(border_type); // if the std::optional is empty, that means that the border type is not // supported, so there's no need to check for specific types if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } + Rectangle rect{width, height}; + using SeparableFilterClass = SeparableFilter2D; svuint16_t kernel_x_0_u16 = svdup_n_u16(kernel_x[0]); -- GitLab