diff --git a/conformity/opencv/test_separable_filter_2d.cpp b/conformity/opencv/test_separable_filter_2d.cpp index 3c544681ffe9a396cfb422492dc82d6d3f1c2582..548967eb19bf2d9592e195e2562e4b0fb2adc335 100644 --- a/conformity/opencv/test_separable_filter_2d.cpp +++ b/conformity/opencv/test_separable_filter_2d.cpp @@ -33,9 +33,9 @@ cv::Mat exec_separable_filter_2d(cv::Mat& input) { cv::Mat input_mat = input.rowRange(0, input.rows - 1).clone(); cv::RNG rng(kernel_seed); - cv::Mat kernel_x(KernelSize, 1, CV_8UC1); + cv::Mat kernel_x(KernelSize, 1, get_opencv_matrix_type()); rng.fill(kernel_x, cv::RNG::UNIFORM, 0, 5); - cv::Mat kernel_y(KernelSize, 1, CV_8UC1); + cv::Mat kernel_y(KernelSize, 1, get_opencv_matrix_type()); rng.fill(kernel_y, cv::RNG::UNIFORM, 0, 5); cv::Mat result; diff --git a/conformity/opencv/utils.h b/conformity/opencv/utils.h index b7ae8eaff3106aa50693cbe4b77ec37d7a6f6538..321a7e5901d37ab6dcb8d07fb55371bd75729dd8 100644 --- a/conformity/opencv/utils.h +++ b/conformity/opencv/utils.h @@ -11,6 +11,15 @@ #include "common.h" +template +constexpr int get_opencv_matrix_type() { + if constexpr (std::is_same_v) { + return CV_8UC(Channels); + } else if constexpr (std::is_same_v) { + return CV_16UC(Channels); + } +} + #if MANAGER template static auto abs_diff(T a, T b) { @@ -71,15 +80,6 @@ bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) { return false; } -template -constexpr int get_opencv_matrix_type() { - if constexpr (std::is_same_v) { - return CV_8UC(Channels); - } else if constexpr (std::is_same_v) { - return CV_16UC(Channels); - } -} - void fail_print_matrices(size_t height, size_t width, cv::Mat& input, cv::Mat& manager_result, cv::Mat& subord_result); diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index f8e7309dd6b03f866024117d97b3ff98c3a0b23b..8f047d0fcb40d560c4d61fc97d2ec812ae0df7e6 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -224,7 +224,7 @@ class GaussianBlur { // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T void vertical_scalar_path(const SourceType src[7], BufferType *dst) const { - uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + uint16_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + src[4] * 14 + src[5] * 7 + src[6] * 2; dst[0] = acc; } @@ -282,7 +282,7 @@ class GaussianBlur { DestinationType *dst) const { uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + src[4] * 14 + src[5] * 7 + src[6] * 2; - dst[0] = rounding_shift_right(acc, 12); + dst[0] = static_cast(rounding_shift_right(acc, 12)); } private: @@ -495,7 +495,7 @@ class GaussianBlur { acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + (src[4] + src[10]) * 81; acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; - dst[0] = rounding_shift_right(acc, 20); + dst[0] = static_cast(rounding_shift_right(acc, 20)); } private: @@ -569,18 +569,18 @@ class GaussianBlur { void vertical_scalar_path(const SourceType src[KernelSize], BufferType *dst) const { - uint32_t acc = static_cast(src[0]) * half_kernel_[0]; + BufferType acc = static_cast(src[0]) * half_kernel_[0]; // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i <= (KernelSize >> 1); i++) { - acc += static_cast(src[i]) * half_kernel_[i]; + acc += static_cast(src[i]) * half_kernel_[i]; } KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = (KernelSize >> 1) + 1; i < KernelSize; i++) { size_t j = KernelSize - i - 1; - acc += static_cast(src[i]) * half_kernel_[j]; + acc += static_cast(src[i]) * half_kernel_[j]; } dst[0] = acc; @@ -609,7 +609,7 @@ class GaussianBlur { void horizontal_scalar_path(const BufferType src[KernelSize], DestinationType *dst) const { - uint32_t acc = src[0] * half_kernel_[0]; + BufferType acc = src[0] * half_kernel_[0]; // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL @@ -623,7 +623,7 @@ class GaussianBlur { acc += src[i] * half_kernel_[j]; } - dst[0] = static_cast(rounding_shift_right(acc, 16)); + dst[0] = static_cast(rounding_shift_right(acc, 16)); } private: diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 7dfc6221badd52d15511f39c5a8ade985940afba..da1009549393927f999c7ef04125e4b3846a0549 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -35,16 +35,19 @@ class SeparableFilter2D { } void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const { - SourceVectorType acc_l = + BufferVectorType acc_l = vmull_u8(vget_low_u8(src[0]), vget_low_u8(kernel_y_u8_[0])); - SourceVectorType acc_h = vmull_high_u8(src[0], kernel_y_u8_[0]); + BufferVectorType acc_h = vmull_high_u8(src[0], kernel_y_u8_[0]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < 5; i++) { - acc_l = - vmlal_u8(acc_l, vget_low_u8(src[i]), vget_low_u8(kernel_y_u8_[i])); - acc_h = vmlal_high_u8(acc_h, src[i], kernel_y_u8_[i]); + BufferVectorType vec_l = + vmull_u8(vget_low_u8(src[i]), vget_low_u8(kernel_y_u8_[i])); + BufferVectorType vec_h = vmull_high_u8(src[i], kernel_y_u8_[i]); + + acc_l = vqaddq_u16(acc_l, vec_l); + acc_h = vqaddq_u16(acc_h, vec_h); } vst1q_u16(&dst[0], acc_l); @@ -66,15 +69,20 @@ class SeparableFilter2D { void horizontal_vector_path(BufferVectorType src[5], DestinationType *dst) const { - BufferVectorType acc = vmulq_u16(src[0], kernel_x_u16_[0]); + uint32x4_t acc_l = + vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_x_u16_[0])); + uint32x4_t acc_h = vmull_high_u16(src[0], kernel_x_u16_[0]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < 5; i++) { - acc = vmlaq_u16(acc, src[i], kernel_x_u16_[i]); + acc_l = vmlal_u16(acc_l, vget_low_u16(src[i]), + vget_low_u16(kernel_x_u16_[i])); + acc_h = vmlal_high_u16(acc_h, src[i], kernel_x_u16_[i]); } - uint8x8_t result = vqmovn_u16(acc); + uint16x8_t acc_u16 = vcombine_u16(vqmovn_u32(acc_l), vqmovn_u32(acc_h)); + uint8x8_t result = vqmovn_u16(acc_u16); vst1_u8(&dst[0], result); } @@ -129,16 +137,19 @@ class SeparableFilter2D { } void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const { - SourceVectorType acc_l = + BufferVectorType acc_l = vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_y_u16_[0])); - SourceVectorType acc_h = vmull_high_u16(src[0], kernel_y_u16_[0]); + BufferVectorType acc_h = vmull_high_u16(src[0], kernel_y_u16_[0]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < 5; i++) { - acc_l = vmlal_u16(acc_l, vget_low_u16(src[i]), - vget_low_u16(kernel_y_u16_[i])); - acc_h = vmlal_high_u16(acc_h, src[i], kernel_y_u16_[i]); + BufferVectorType vec_l = + vmull_u16(vget_low_u16(src[i]), vget_low_u16(kernel_y_u16_[i])); + BufferVectorType vec_h = vmull_high_u16(src[i], kernel_y_u16_[i]); + + acc_l = vqaddq_u32(acc_l, vec_l); + acc_h = vqaddq_u32(acc_h, vec_h); } vst1q_u32(&dst[0], acc_l); @@ -160,15 +171,20 @@ class SeparableFilter2D { void horizontal_vector_path(BufferVectorType src[5], DestinationType *dst) const { - BufferVectorType acc = vmulq_u32(src[0], kernel_x_u32_[0]); + uint64x2_t acc_l = + vmull_u32(vget_low_u32(src[0]), vget_low_u32(kernel_x_u32_[0])); + uint64x2_t acc_h = vmull_high_u32(src[0], kernel_x_u32_[0]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < 5; i++) { - acc = vmlaq_u32(acc, src[i], kernel_x_u32_[i]); + acc_l = vmlal_u32(acc_l, vget_low_u32(src[i]), + vget_low_u32(kernel_x_u32_[i])); + acc_h = vmlal_high_u32(acc_h, src[i], kernel_x_u32_[i]); } - uint16x4_t result = vqmovn_u32(acc); + uint32x4_t acc_u32 = vcombine_u32(vqmovn_u64(acc_l), vqmovn_u64(acc_h)); + uint16x4_t result = vqmovn_u32(acc_u32); vst1_u16(&dst[0], result); } diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index fca4b62d45606af1b77523fca5267984f560d3b5..1d00e92deeca70090353b9455741a40f1b8454e2 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -55,20 +55,28 @@ class SeparableFilter2D { BufferVectorType acc_t = svmullt_u16(src_0, kernel_y_0_u8_); // 1 - acc_b = svmlalb_u16(acc_b, src_1, kernel_y_1_u8_); - acc_t = svmlalt_u16(acc_t, src_1, kernel_y_1_u8_); + BufferVectorType vec_b = svmullb_u16(src_1, kernel_y_1_u8_); + BufferVectorType vec_t = svmullt_u16(src_1, kernel_y_1_u8_); + acc_b = svqadd_u16_x(pg, acc_b, vec_b); + acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 2 - acc_b = svmlalb_u16(acc_b, src_2, kernel_y_2_u8_); - acc_t = svmlalt_u16(acc_t, src_2, kernel_y_2_u8_); + vec_b = svmullb_u16(src_2, kernel_y_2_u8_); + vec_t = svmullt_u16(src_2, kernel_y_2_u8_); + acc_b = svqadd_u16_x(pg, acc_b, vec_b); + acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 3 - acc_b = svmlalb_u16(acc_b, src_3, kernel_y_3_u8_); - acc_t = svmlalt_u16(acc_t, src_3, kernel_y_3_u8_); + vec_b = svmullb_u16(src_3, kernel_y_3_u8_); + vec_t = svmullt_u16(src_3, kernel_y_3_u8_); + acc_b = svqadd_u16_x(pg, acc_b, vec_b); + acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 4 - acc_b = svmlalb_u16(acc_b, src_4, kernel_y_4_u8_); - acc_t = svmlalt_u16(acc_t, src_4, kernel_y_4_u8_); + vec_b = svmullb_u16(src_4, kernel_y_4_u8_); + vec_t = svmullt_u16(src_4, kernel_y_4_u8_); + acc_b = svqadd_u16_x(pg, acc_b, vec_b); + acc_t = svqadd_u16_x(pg, acc_t, vec_t); BufferDoubleVectorType interleaved = svcreate2_u16(acc_b, acc_t); svst2(pg, &dst[0], interleaved); @@ -79,24 +87,34 @@ class SeparableFilter2D { BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - BufferVectorType acc = svmul_u16_x(pg, src_0, kernel_x_0_u16_); + svuint32_t acc_b = svmullb_u32(src_0, kernel_x_0_u16_); + svuint32_t acc_t = svmullt_u32(src_0, kernel_x_0_u16_); // 1 - acc = svmla_u16_x(pg, acc, src_1, kernel_x_1_u16_); + acc_b = svmlalb_u32(acc_b, src_1, kernel_x_1_u16_); + acc_t = svmlalt_u32(acc_t, src_1, kernel_x_1_u16_); // 2 - acc = svmla_u16_x(pg, acc, src_2, kernel_x_2_u16_); + acc_b = svmlalb_u32(acc_b, src_2, kernel_x_2_u16_); + acc_t = svmlalt_u32(acc_t, src_2, kernel_x_2_u16_); // 3 - acc = svmla_u16_x(pg, acc, src_3, kernel_x_3_u16_); + acc_b = svmlalb_u32(acc_b, src_3, kernel_x_3_u16_); + acc_t = svmlalt_u32(acc_t, src_3, kernel_x_3_u16_); // 4 - acc = svmla_u16_x(pg, acc, src_4, kernel_x_4_u16_); + acc_b = svmlalb_u32(acc_b, src_4, kernel_x_4_u16_); + acc_t = svmlalt_u32(acc_t, src_4, kernel_x_4_u16_); + + svuint16_t acc_u16_b = svqxtnb_u32(acc_b); + svuint16_t acc_u16 = svqxtnt_u32(acc_u16_b, acc_t); svbool_t greater = - svcmpgt_n_u16(pg, acc, std::numeric_limits::max()); - acc = svdup_n_u16_m(acc, greater, std::numeric_limits::max()); - svst1b_u16(pg, &dst[0], acc); + svcmpgt_n_u16(pg, acc_u16, std::numeric_limits::max()); + acc_u16 = + svdup_n_u16_m(acc_u16, greater, std::numeric_limits::max()); + + svst1b_u16(pg, &dst[0], acc_u16); } void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) @@ -177,20 +195,28 @@ class SeparableFilter2D { BufferVectorType acc_t = svmullt_u32(src_0, kernel_y_0_u16_); // 1 - acc_b = svmlalb_u32(acc_b, src_1, kernel_y_1_u16_); - acc_t = svmlalt_u32(acc_t, src_1, kernel_y_1_u16_); + BufferVectorType vec_b = svmullb_u32(src_1, kernel_y_1_u16_); + BufferVectorType vec_t = svmullt_u32(src_1, kernel_y_1_u16_); + acc_b = svqadd_u32_x(pg, acc_b, vec_b); + acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 2 - acc_b = svmlalb_u32(acc_b, src_2, kernel_y_2_u16_); - acc_t = svmlalt_u32(acc_t, src_2, kernel_y_2_u16_); + vec_b = svmullb_u32(src_2, kernel_y_2_u16_); + vec_t = svmullt_u32(src_2, kernel_y_2_u16_); + acc_b = svqadd_u32_x(pg, acc_b, vec_b); + acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 3 - acc_b = svmlalb_u32(acc_b, src_3, kernel_y_3_u16_); - acc_t = svmlalt_u32(acc_t, src_3, kernel_y_3_u16_); + vec_b = svmullb_u32(src_3, kernel_y_3_u16_); + vec_t = svmullt_u32(src_3, kernel_y_3_u16_); + acc_b = svqadd_u32_x(pg, acc_b, vec_b); + acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 4 - acc_b = svmlalb_u32(acc_b, src_4, kernel_y_4_u16_); - acc_t = svmlalt_u32(acc_t, src_4, kernel_y_4_u16_); + vec_b = svmullb_u32(src_4, kernel_y_4_u16_); + vec_t = svmullt_u32(src_4, kernel_y_4_u16_); + acc_b = svqadd_u32_x(pg, acc_b, vec_b); + acc_t = svqadd_u32_x(pg, acc_t, vec_t); BufferDoubleVectorType interleaved = svcreate2_u32(acc_b, acc_t); svst2(pg, &dst[0], interleaved); @@ -201,24 +227,34 @@ class SeparableFilter2D { BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - BufferVectorType acc = svmul_u32_x(pg, src_0, kernel_x_0_u32_); + svuint64_t acc_b = svmullb_u64(src_0, kernel_x_0_u32_); + svuint64_t acc_t = svmullt_u64(src_0, kernel_x_0_u32_); // 1 - acc = svmla_u32_x(pg, acc, src_1, kernel_x_1_u32_); + acc_b = svmlalb_u64(acc_b, src_1, kernel_x_1_u32_); + acc_t = svmlalt_u64(acc_t, src_1, kernel_x_1_u32_); // 2 - acc = svmla_u32_x(pg, acc, src_2, kernel_x_2_u32_); + acc_b = svmlalb_u64(acc_b, src_2, kernel_x_2_u32_); + acc_t = svmlalt_u64(acc_t, src_2, kernel_x_2_u32_); // 3 - acc = svmla_u32_x(pg, acc, src_3, kernel_x_3_u32_); + acc_b = svmlalb_u64(acc_b, src_3, kernel_x_3_u32_); + acc_t = svmlalt_u64(acc_t, src_3, kernel_x_3_u32_); // 4 - acc = svmla_u32_x(pg, acc, src_4, kernel_x_4_u32_); + acc_b = svmlalb_u64(acc_b, src_4, kernel_x_4_u32_); + acc_t = svmlalt_u64(acc_t, src_4, kernel_x_4_u32_); + + svuint32_t acc_u32_b = svqxtnb_u64(acc_b); + svuint32_t acc_u32 = svqxtnt_u64(acc_u32_b, acc_t); svbool_t greater = - svcmpgt_n_u32(pg, acc, std::numeric_limits::max()); - acc = svdup_n_u32_m(acc, greater, std::numeric_limits::max()); - svst1h_u32(pg, &dst[0], acc); + svcmpgt_n_u32(pg, acc_u32, std::numeric_limits::max()); + acc_u32 = + svdup_n_u32_m(acc_u32, greater, std::numeric_limits::max()); + + svst1h_u32(pg, &dst[0], acc_u32); } void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp index de2f7e19acc3c88eac550afe987d057e1ac23130..16ad58f91a5b55270bf83c61d5df0436d269f855 100644 --- a/test/api/test_separable_filter_2d.cpp +++ b/test/api/test_separable_filter_2d.cpp @@ -158,7 +158,7 @@ TYPED_TEST(SeparableFilter2D, 5x5) { .test(mask, 5); } -TEST(SeparableFilter2D, 5x5_U8Overflow) { +TEST(SeparableFilter2D, 5x5_U8OverflowSequence) { using TypeParam = uint8_t; kleidicv_filter_context_t *context = nullptr; @@ -178,12 +178,31 @@ TEST(SeparableFilter2D, 5x5_U8Overflow) { test::Array2D kernel_y{5, 1}; kernel_y.set(0, 0, {5, 6, 7, 8, 9}); + test::Array2D dst_expected{5, 5, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 255, 255, 255, 255, 255}); + dst_expected.set(1, 0, { 255, 255, 255, 255, 255}); + dst_expected.set(2, 0, { 255, 255, 255, 255, 255}); + dst_expected.set(3, 0, { 255, 255, 255, 255, 255}); + dst_expected.set(4, 0, { 255, 255, 255, 255, 255}); + // clang-format on + test::Array2D dst{5, 5, test::Options::vector_length()}; EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( src.data(), src.stride(), dst.data(), dst.stride(), 5, 5, 1, kernel_x.data(), 5, kernel_y.data(), 5, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + +TEST(SeparableFilter2D, 5x5_U8OverflowMax) { + using TypeParam = uint8_t; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 5, 5)); + test::Array2D src{5, 5, test::Options::vector_length()}; // clang-format off src.set(0, 0, { 255, 255, 255, 255, 255}); src.set(1, 0, { 255, 255, 255, 255, 255}); @@ -192,6 +211,11 @@ TEST(SeparableFilter2D, 5x5_U8Overflow) { src.set(4, 0, { 255, 255, 255, 255, 255}); // clang-format on + test::Array2D kernel_x{5, 1}; + kernel_x.set(0, 0, {255, 255, 255, 255, 255}); + test::Array2D kernel_y{5, 1}; + kernel_y.set(0, 0, {255, 255, 255, 255, 255}); + test::Array2D dst_expected{5, 5, test::Options::vector_length()}; // clang-format off dst_expected.set(0, 0, { 255, 255, 255, 255, 255}); @@ -200,20 +224,97 @@ TEST(SeparableFilter2D, 5x5_U8Overflow) { dst_expected.set(3, 0, { 255, 255, 255, 255, 255}); dst_expected.set(4, 0, { 255, 255, 255, 255, 255}); // clang-format on + + test::Array2D dst{5, 5, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 5, 5, 1, kernel_x.data(), 5, kernel_y.data(), 5, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); EXPECT_EQ_ARRAY2D(dst_expected, dst); +} - kernel_x.set(0, 0, {255, 255, 255, 255, 255}); - kernel_y.set(0, 0, {255, 255, 255, 255, 255}); +TEST(SeparableFilter2D, 5x5_U8OverflowVectorNEON) { + using TypeParam = uint8_t; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 13, 6)); + test::Array2D src{13, 6, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 232, 175, 8, 66, 167, 249, 190, 176, 89, 230, 120, 71, 14}); + src.set(1, 0, { 222, 254, 230, 253, 64, 127, 144, 43, 172, 110, 22, 232, 233}); + src.set(2, 0, { 106, 40, 40, 59, 18, 204, 247, 252, 179, 69, 163, 190, 58}); + src.set(3, 0, { 213, 22, 107, 111, 233, 10, 51, 17, 35, 14, 197, 157, 237}); + src.set(4, 0, { 96, 180, 160, 185, 146, 15, 103, 62, 227, 180, 249, 82, 83}); + src.set(5, 0, { 167, 150, 176, 149, 65, 246, 237, 234, 138, 51, 159, 218, 245}); + // clang-format on + + test::Array2D kernel_x{5, 1}; + kernel_x.set(0, 0, {23, 149, 238, 48, 224}); + test::Array2D kernel_y{5, 1}; + kernel_y.set(0, 0, {96, 254, 32, 81, 7}); + + test::Array2D dst_expected{13, 6, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(1, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(2, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(3, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(4, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(5, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + // clang-format on + + test::Array2D dst{13, 6, test::Options::vector_length()}; EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( src.data(), src.stride(), dst.data(), dst.stride(), - 5, 5, 1, kernel_x.data(), 5, kernel_y.data(), 5, + 13, 6, 1, kernel_x.data(), 5, kernel_y.data(), 5, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); EXPECT_EQ_ARRAY2D(dst_expected, dst); } -TEST(SeparableFilter2D, 5x5_U16Overflow) { +TEST(SeparableFilter2D, 5x5_U8OverflowVectorSC) { + using TypeParam = uint8_t; + + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 13, 6)); + test::Array2D src{13, 6, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 133, 210, 177, 6, 5, 200, 6, 242, 237, 80, 223, 253, 241}); + src.set(1, 0, { 112, 148, 209, 186, 188, 202, 18, 215, 193, 109, 226, 154, 207}); + src.set(2, 0, { 95, 216, 99, 161, 209, 183, 45, 226, 116, 210, 183, 11, 190}); + src.set(3, 0, { 237, 170, 10, 80, 207, 52, 69, 119, 68, 16, 239, 103, 25}); + src.set(4, 0, { 249, 106, 195, 207, 18, 123, 244, 63, 183, 13, 52, 196, 106}); + src.set(5, 0, { 66, 17, 191, 246, 246, 166, 137, 102, 84, 239, 245, 199, 144}); + // clang-format on + + test::Array2D kernel_x{5, 1}; + kernel_x.set(0, 0, {99, 31, 197, 141, 71}); + test::Array2D kernel_y{5, 1}; + kernel_y.set(0, 0, {60, 231, 86, 4, 140}); + + test::Array2D dst_expected{13, 6, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(1, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(2, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(3, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(4, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + dst_expected.set(5, 0, { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}); + // clang-format on + + test::Array2D dst{13, 6, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 13, 6, 1, kernel_x.data(), 5, kernel_y.data(), 5, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + +TEST(SeparableFilter2D, 5x5_U16OverflowSequence) { using TypeParam = uint16_t; kleidicv_filter_context_t *context = nullptr; @@ -236,13 +337,7 @@ TEST(SeparableFilter2D, 5x5_U16Overflow) { test::Array2D kernel_y{5, 1}; kernel_y.set(0, 0, {38, 0, 38, 0, 38}); - test::Array2D dst{7, 8, test::Options::vector_length()}; - EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( - src.data(), src.stride(), dst.data(), dst.stride(), - 7, 8, 1, kernel_x.data(), 5, kernel_y.data(), 5, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - - test::Array2D dst_expected{7, 8, test::Options::vector_length()}; + test::Array2D dst_expected{7, 8, test::Options::vector_length()}; // clang-format off dst_expected.set(0, 0, { 30324, 38988, 47652, 60648, 65535, 65535, 65535}); dst_expected.set(1, 0, { 38988, 47652, 56316, 65535, 65535, 65535, 65535}); @@ -253,11 +348,40 @@ TEST(SeparableFilter2D, 5x5_U16Overflow) { dst_expected.set(6, 0, { 65535, 65535, 65535, 60648, 60648, 43320, 51984}); dst_expected.set(7, 0, { 65535, 65535, 56316, 65535, 43320, 51984, 47652}); // clang-format on + + test::Array2D dst{7, 8, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 7, 8, 1, kernel_x.data(), 5, kernel_y.data(), 5, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + +TEST(SeparableFilter2D, 5x5_U16OverflowBigKernel) { + using TypeParam = uint16_t; + + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 7, 8)); + test::Array2D src{7, 8, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 1, 2, 3, 4, 5, 6, 7}); + src.set(1, 0, { 2, 3, 4, 5, 6, 7, 8}); + src.set(2, 0, { 3, 4, 5, 6, 7, 8, 9}); + src.set(3, 0, { 4, 5, 6, 7, 8, 9, 1}); + src.set(4, 0, { 5, 6, 7, 8, 9, 1, 2}); + src.set(5, 0, { 6, 7, 8, 9, 1, 2, 3}); + src.set(6, 0, { 7, 8, 9, 1, 2, 3, 4}); + src.set(7, 0, { 8, 9, 1, 2, 3, 4, 5}); + // clang-format on + test::Array2D kernel_x{5, 1}; kernel_x.set(0, 0, {83, 94, 83, 94, 83}); + test::Array2D kernel_y{5, 1}; kernel_y.set(0, 0, {94, 83, 94, 83, 94}); + test::Array2D dst_expected{7, 8, test::Options::vector_length()}; // clang-format off dst_expected.set(0, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); dst_expected.set(1, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); @@ -269,12 +393,22 @@ TEST(SeparableFilter2D, 5x5_U16Overflow) { dst_expected.set(7, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); // clang-format on + test::Array2D dst{7, 8, test::Options::vector_length()}; EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( src.data(), src.stride(), dst.data(), dst.stride(), 7, 8, 1, kernel_x.data(), 5, kernel_y.data(), 5, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); EXPECT_EQ_ARRAY2D(dst_expected, dst); +} +TEST(SeparableFilter2D, 5x5_U16OverflowMax) { + using TypeParam = uint16_t; + + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 7, 8)); + test::Array2D src{7, 8, test::Options::vector_length()}; // clang-format off src.set(0, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); src.set(1, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); @@ -286,9 +420,24 @@ TEST(SeparableFilter2D, 5x5_U16Overflow) { src.set(7, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); // clang-format on + test::Array2D kernel_x{5, 1}; kernel_x.set(0, 0, {65535, 65535, 65535, 65535, 65535}); + test::Array2D kernel_y{5, 1}; kernel_y.set(0, 0, {65535, 65535, 65535, 65535, 65535}); + test::Array2D dst_expected{7, 8, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(1, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(2, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(3, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(4, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(5, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(6, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(7, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + // clang-format on + + test::Array2D dst{7, 8, test::Options::vector_length()}; EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( src.data(), src.stride(), dst.data(), dst.stride(), 7, 8, 1, kernel_x.data(), 5, kernel_y.data(), 5, @@ -297,6 +446,62 @@ TEST(SeparableFilter2D, 5x5_U16Overflow) { EXPECT_EQ_ARRAY2D(dst_expected, dst); } +TEST(SeparableFilter2D, 5x5_U16OverflowVector) { + using TypeParam = uint16_t; + + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 5, 5, 18, 7)); + test::Array2D src{18, 7, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 7069, 15555, 36257, 50924, 19919, 14775, 5812, 63033, + 12337, 31198, 64955, 38064, 52102, 33736, 44794, 28036, 28418, 51544}); + src.set(1, 0, { 56176, 39501, 12937, 60165, 41073, 42249, 26998, 8958, + 17167, 567, 49467, 56007, 9385, 49384, 52038, 3262, 42863, 57617}); + src.set(2, 0, { 53432, 9693, 54092, 741, 835, 61755, 3707, 3429, + 20223, 65475, 42973, 9837, 41947, 41431, 53538, 2774, 50094, 65193}); + src.set(3, 0, { 2673, 45570, 2199, 38120, 55556, 7612, 53485, 44718, + 16967, 60551, 63543, 55699, 45352, 58886, 52300, 36045, 16187, 6794}); + src.set(4, 0, { 50260, 62222, 30989, 44610, 41729, 64829, 48408, 62415, + 20341, 13347, 26792, 9543, 45732, 3551, 43217, 41365, 4666, 41742}); + src.set(5, 0, { 55105, 31681, 64645, 51293, 43515, 8779, 43396, 12372, + 37819, 61444, 10427, 49746, 12989, 58916, 27310, 46273, 60514, 59064}); + src.set(6, 0, { 40983, 23334, 50325, 15939, 50201, 54234, 2318, 5649, + 32631, 44612, 49516, 36557, 20168, 17045, 40077, 60173, 61168, 3247}); + // clang-format on + + test::Array2D kernel_x{5, 1}; + kernel_x.set(0, 0, {60064, 6000, 11871, 49673, 48017}); + test::Array2D kernel_y{5, 1}; + kernel_y.set(0, 0, {8956, 29661, 59112, 41299, 41083}); + + test::Array2D dst_expected{18, 7, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(1, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(2, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(3, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(4, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(5, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + dst_expected.set(6, 0, { 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, + 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535}); + // clang-format on + + test::Array2D dst{18, 7, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 18, 7, 1, kernel_x.data(), 5, kernel_y.data(), 5, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + TYPED_TEST(SeparableFilter2D, NullPointer) { using KernelTestParams = SeparableFilter2DKernelTestParams; kleidicv_filter_context_t *context = nullptr;