From 9dd8f83f107ef653ddac59486b444a76ebde5950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 30 May 2024 18:28:48 +0200 Subject: [PATCH 1/8] Fix capitalization in filter include guards --- kleidicv/include/kleidicv/separable_filter_5x5_sc.h | 6 +++--- kleidicv/include/kleidicv/separable_filter_7x7_sc.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h index 2115c1ed0..54f8c1aa6 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_5x5_SC_H -#define KLEIDICV_SEPARABLE_FILTER_5x5_SC_H +#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_SC_H +#define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H #include "kleidicv/sve2.h" #include "kleidicv/workspace/border_5x5.h" @@ -178,4 +178,4 @@ using SeparableFilter5x5 = SeparableFilter; } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SEPARABLE_FILTER_5x5_SC_H +#endif // KLEIDICV_SEPARABLE_FILTER_5X5_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h index a19e01e3a..630340856 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_7x7_SC_H -#define KLEIDICV_SEPARABLE_FILTER_7x7_SC_H +#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_SC_H +#define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H #include "kleidicv/sve2.h" #include "kleidicv/workspace/border_7x7.h" @@ -194,4 +194,4 @@ using SeparableFilter7x7 = SeparableFilter; } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SEPARABLE_FILTER_7x7_SC_H +#endif // KLEIDICV_SEPARABLE_FILTER_7X7_SC_H -- GitLab From 09bb6ebe222f2c979b57765120bd942ff557f99e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 29 May 2024 13:53:00 +0200 Subject: [PATCH 2/8] Add implementation for NEON 15x15 Gaussian blur --- adapters/opencv/kleidicv_hal.cpp | 9 +- conformity/opencv/test_gaussian_blur.cpp | 46 ++- .../include/kleidicv/filters/gaussian_blur.h | 21 ++ kleidicv/include/kleidicv/kleidicv.h | 20 ++ .../kleidicv/separable_filter_15x15_neon.h | 211 +++++++++++++ .../include/kleidicv/workspace/border_15x15.h | 276 ++++++++++++++++++ kleidicv/src/filters/gaussian_blur_api.cpp | 5 + kleidicv/src/filters/gaussian_blur_neon.cpp | 241 ++++++++++++++- kleidicv/src/filters/gaussian_blur_sme2.cpp | 7 + kleidicv/src/filters/gaussian_blur_sve2.cpp | 8 + test/api/test_gaussian_blur.cpp | 205 ++++++++++++- 11 files changed, 1037 insertions(+), 12 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_neon.h create mode 100644 kleidicv/include/kleidicv/workspace/border_15x15.h diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 23087b731..4a65e5e20 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -272,6 +272,8 @@ int gaussian_blur_binomial(const uchar *src_data, size_t src_step, impl = kleidicv_gaussian_blur_5x5_u8; } else if ((kernel_size == 7) && (width >= 7) && (height >= 7)) { impl = kleidicv_gaussian_blur_7x7_u8; + } else if ((kernel_size == 15) && (width >= 15) && (height >= 15)) { + impl = kleidicv_gaussian_blur_15x15_u8; } else { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -281,13 +283,16 @@ int gaussian_blur_binomial(const uchar *src_data, size_t src_step, if (type_size == SIZE_MAX) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - type_size *= 2; /* widening */ + + // widening + size_t intermediate_size = + (kernel_size == 15) ? 4 * type_size : 2 * type_size; kleidicv_rectangle_t image = { .width = static_cast(width), .height = static_cast(height)}; if (kleidicv_error_t create_err = - kleidicv_filter_create(&context, cn, type_size, image)) { + kleidicv_filter_create(&context, cn, intermediate_size, image)) { return convert_error(create_err); } diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index f56f4a933..1191488d5 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -20,17 +20,33 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { cv::RNG rng(0); - for (size_t x = 5; x <= 16; ++x) { - for (size_t y = 5; y <= 16; ++y) { - cv::Mat input(x, y, CV_8UC(Channels)); + size_t size_min = 5; + size_t size_max = 16; + if constexpr (KernelSize == 15) { + size_min = 14; + size_max = 32; + } + + for (size_t y = size_min; y <= size_max; ++y) { + for (size_t x = size_min; x <= size_max; ++x) { + cv::Mat input(y, x, CV_8UC(Channels)); rng.fill(input, cv::RNG::UNIFORM, 0, 255); cv::Mat actual = exec_gaussian_blur(input); cv::Mat expected = get_expected_from_subordinate(index, request_queue, reply_queue, input); - if (are_matrices_different(0, actual, expected)) { - fail_print_matrices(x, y, input, actual, expected); + uint8_t threshold = 0; + // There are currently rounding differences sometimes + // between the OpenCV and KleidiCV implementations that use + // the 15x15 kernel size, so we ignore any non-matching + // values that fall within the specified threshold. + if constexpr (KernelSize == 15) { + threshold = 2; + } + + if (are_matrices_different(threshold, actual, expected)) { + fail_print_matrices(y, x, input, actual, expected); return true; } } @@ -102,6 +118,26 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 7x7, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 7x7, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 7x7, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), + + TEST("Gaussian blur 15x15, BORDER_REFLECT, 1 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT, 2 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT, 3 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 15x15, BORDER_REFLECT, 4 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), + + TEST("Gaussian blur 15x15, BORDER_WRAP, 1 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), + TEST("Gaussian blur 15x15, BORDER_WRAP, 2 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), + TEST("Gaussian blur 15x15, BORDER_WRAP, 3 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), + TEST("Gaussian blur 15x15, BORDER_WRAP, 4 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), + + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 769c5480f..d14f75e43 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -33,6 +33,13 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace neon namespace sve2 { @@ -58,6 +65,13 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sve2 namespace sme2 { @@ -83,6 +97,13 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sme2 } // namespace kleidicv diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 134ca5ed8..b74a3aa2f 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1214,6 +1214,18 @@ kleidicv_error_t kleidicv_filter_release(kleidicv_filter_context_t *context); /// [ 14, 49, 98, 126, 98, 49, 14 ] /// [ 4, 14, 28, 36, 28, 14, 4 ] /// ``` +/// 15x15 Gaussian Blur filter for uint8_t types: +/// ``` +/// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +/// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +/// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +/// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +/// 1/1048576 * [ | | | | ... | | | | ] +/// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +/// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +/// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +/// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +/// ``` /// /// Width and height are the same for the source and for the destination. Number /// of elements is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. @@ -1270,6 +1282,14 @@ KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_7x7_u8, const uint8_t *src, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +/// @copydoc kleidicv_gaussian_blur_3x3_u8 +/// +KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_15x15_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + /// Splits a multi channel source stream into separate 1-channel streams. Width /// and height are the same for the source stream and for all the destination /// streams. Number of pixels is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h new file mode 100644 index 000000000..425769eee --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_15x15.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 15x15 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{7UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[15]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[15]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + src[5] = src_rows.at(border_offsets.c5())[index]; + src[6] = src_rows.at(border_offsets.c6())[index]; + src[7] = src_rows.at(border_offsets.c7())[index]; + src[8] = src_rows.at(border_offsets.c8())[index]; + src[9] = src_rows.at(border_offsets.c9())[index]; + src[10] = src_rows.at(border_offsets.c10())[index]; + src[11] = src_rows.at(border_offsets.c11())[index]; + src[12] = src_rows.at(border_offsets.c12())[index]; + src[13] = src_rows.at(border_offsets.c13())[index]; + src[14] = src_rows.at(border_offsets.c14())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + + BufferVectorType src_a[15], src_b[15]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + src_a[7] = vld1q(&src_7[0]); + src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]); + src_a[8] = vld1q(&src_8[0]); + src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]); + src_a[9] = vld1q(&src_9[0]); + src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]); + src_a[10] = vld1q(&src_10[0]); + src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]); + src_a[11] = vld1q(&src_11[0]); + src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]); + src_a[12] = vld1q(&src_12[0]); + src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]); + src_a[13] = vld1q(&src_13[0]); + src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]); + src_a[14] = vld1q(&src_14[0]); + src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[15]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[15]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 15x15 separable filters driver type. +template +using SeparableFilter15x15 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H diff --git a/kleidicv/include/kleidicv/workspace/border_15x15.h b/kleidicv/include/kleidicv/workspace/border_15x15.h new file mode 100644 index 000000000..eb3ae12ad --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_15x15.h @@ -0,0 +1,276 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_15X15_H +#define KLEIDICV_WORKSPACE_BORDER_15X15_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 15x15 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, size_t o11, + size_t o12, size_t o13, size_t o14) + : offsets_{o0, o1, o2, o3, o4, o5, o6, o7, + o8, o9, o10, o11, o12, o13, o14} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + size_t c7() const { return offsets_[7]; } + size_t c8() const { return offsets_[8]; } + size_t c9() const { return offsets_[9]; } + size_t c10() const { return offsets_[10]; } + size_t c11() const { return offsets_[11]; } + size_t c12() const { return offsets_[12]; } + size_t c13() const { return offsets_[13]; } + size_t c14() const { return offsets_[14]; } + + private: + size_t offsets_[15]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + + // NOLINTBEGIN(readability-function-cognitive-complexity) + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(-1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(-2, -2, -2, -2, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(-3, -3, -3, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-4, -4, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-5, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(4, 3, 2, 1, 0, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(2, 1, 0, -1, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(0, -1, -2, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-2, -3, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-4, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3, 4, 5, 6, + 7); + } else if (column_index == 1) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, height_ - 2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, -3, -2, + -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(height_ - 7, height_ - 6, height_ - 5, -4, -3, -2, -1, 0, + 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(height_ - 7, height_ - 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, + 4, 5, 6, 7); + } else { + return get(height_ - 7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(5, 4, 3, 2, 1, 0, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(3, 2, 1, 0, -1, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(1, 0, -1, -2, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-1, -2, -3, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-3, -4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 5); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 4, 4); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, 3, 3); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 2, 2, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 1, 1, 1, 1, 1); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 4); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 3, 2); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 2, 1, 0); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 1, 0, -1, -2); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 0, -1, -2, -3, -4); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, -1, -2, -3, -4, -5, -6); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7 - height_); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6 - height_, + 7 - height_); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 - height_, + 6 - height_, 7 - height_); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 - height_, + 5 - height_, 6 - height_, 7 - height_); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3 - height_, + 4 - height_, 5 - height_, 6 - height_, 7 - height_); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2 - height_, 3 - height_, + 4 - height_, 5 - height_, 6 - height_, 7 - height_); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1 - height_, 2 - height_, + 3 - height_, 4 - height_, 5 - height_, 6 - height_, + 7 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 5); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 4, 3); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 3, 2, 1); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 2, 1, 0, -1); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 1, 0, -1, -2, -3); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 0, -1, -2, -3, -4, -5); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5, -6, -7); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + // NOLINTEND(readability-function-cognitive-complexity) + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 6U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 7U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, + size_t o11, size_t o12, size_t o13, + size_t o14) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, + o8, o9, o10, o11, o12, o13, o14}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 15x15 filter border type. +template +using FixedBorderInfo15x15 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_15X15_H diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index f42da7913..6732e1d29 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -66,3 +66,8 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_gaussian_blur_7x7_u8, &kleidicv::neon::gaussian_blur_7x7_u8, KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_7x7_u8), &kleidicv::sme2::gaussian_blur_7x7_u8); + +KLEIDICV_MULTIVERSION_C_API( + kleidicv_gaussian_blur_15x15_u8, &kleidicv::neon::gaussian_blur_15x15_u8, + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_15x15_u8), + &kleidicv::sme2::gaussian_blur_15x15_u8); diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index fe451b42a..e54bb4665 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -5,6 +5,7 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_15x15_neon.h" #include "kleidicv/separable_filter_3x3_neon.h" #include "kleidicv/separable_filter_5x5_neon.h" #include "kleidicv/separable_filter_7x7_neon.h" @@ -284,6 +285,228 @@ class DiscreteGaussianBlur { uint16x8_t const_9_u16_; }; // end of class DiscreteGaussianBlur +// Template for 15x15 Gaussian Blur approximation filters. +// +// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +// F = 1/1048576 * [ | | | | ... | | | | ] = +// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +// +// [ 4 ] +// [ 11 ] +// [ 25 ] +// [ 48 ] +// [ 81 ] +// [ 118 ] +// [ 146 ] +// = 1/1048576 * [ 158 ] * [4,11,25,48,81,118,146,158,146,118,81,48,25,11,4] +// [ 146 ] +// [ 118 ] +// [ 81 ] +// [ 48 ] +// [ 25 ] +// [ 11 ] +// [ 4 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint32_t; + using DestinationType = uint8_t; + + DiscreteGaussianBlur() + : const_11_u16_{vdupq_n_u16(11)}, + const_11_u32_{vdupq_n_u32(11)}, + const_25_u16_{vdupq_n_u16(25)}, + const_25_u32_{vdupq_n_u32(25)}, + const_81_u16_{vdupq_n_u16(81)}, + const_81_u32_{vdupq_n_u32(81)}, + const_118_u16_half_{vdup_n_u16(118)}, + const_118_u32_{vdupq_n_u32(118)}, + const_146_u16_half_{vdup_n_u16(146)}, + const_146_u32_{vdupq_n_u32(146)}, + const_158_u16_half_{vdup_n_u16(158)}, + const_158_u32_{vdupq_n_u32(158)} {} + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void vertical_vector_path(uint8x16_t src[15], BufferType *dst) const { + uint16x8_t acc_7_l = vmovl_u8(vget_low_u8(src[7])); + uint16x8_t acc_7_h = vmovl_u8(vget_high_u8(src[7])); + + uint16x8_t acc_1_13_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[13])); + uint16x8_t acc_1_13_h = + vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[13])); + + uint16x8_t acc_2_12_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[12])); + uint16x8_t acc_2_12_h = + vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[12])); + + uint16x8_t acc_6_8_l = vaddl_u8(vget_low_u8(src[6]), vget_low_u8(src[8])); + uint16x8_t acc_6_8_h = vaddl_u8(vget_high_u8(src[6]), vget_high_u8(src[8])); + + uint16x8_t acc_5_9_l = vaddl_u8(vget_low_u8(src[5]), vget_low_u8(src[9])); + uint16x8_t acc_5_9_h = vaddl_u8(vget_high_u8(src[5]), vget_high_u8(src[9])); + + uint16x8_t acc_0_14_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[14])); + uint16x8_t acc_0_14_h = + vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[14])); + + uint16x8_t acc_3_11_l = vaddl_u8(vget_low_u8(src[3]), vget_low_u8(src[11])); + uint16x8_t acc_3_11_h = + vaddl_u8(vget_high_u8(src[3]), vget_high_u8(src[11])); + + uint16x8_t acc_4_10_l = vaddl_u8(vget_low_u8(src[4]), vget_low_u8(src[10])); + uint16x8_t acc_4_10_h = + vaddl_u8(vget_high_u8(src[4]), vget_high_u8(src[10])); + + acc_0_14_l = vshlq_n_u16(acc_0_14_l, 2); + acc_0_14_h = vshlq_n_u16(acc_0_14_h, 2); + + acc_3_11_l = vshlq_n_u16(acc_3_11_l, 2); + acc_3_11_h = vshlq_n_u16(acc_3_11_h, 2); + + acc_4_10_l = vmulq_u16(acc_4_10_l, const_81_u16_); + acc_4_10_h = vmulq_u16(acc_4_10_h, const_81_u16_); + + uint16x8_t acc_1_3_11_13_l = vaddq_u16(acc_3_11_l, acc_1_13_l); + uint16x8_t acc_1_3_11_13_h = vaddq_u16(acc_3_11_h, acc_1_13_h); + acc_1_3_11_13_l = vmlaq_u16(acc_3_11_l, acc_1_3_11_13_l, const_11_u16_); + acc_1_3_11_13_h = vmlaq_u16(acc_3_11_h, acc_1_3_11_13_h, const_11_u16_); + + uint16x8_t acc_0_1_3_11_13_14_l = vaddq_u16(acc_1_3_11_13_l, acc_0_14_l); + uint16x8_t acc_0_1_3_11_13_14_h = vaddq_u16(acc_1_3_11_13_h, acc_0_14_h); + + uint16x8_t acc_2_4_10_12_l = + vmlaq_u16(acc_4_10_l, acc_2_12_l, const_25_u16_); + uint16x8_t acc_2_4_10_12_h = + vmlaq_u16(acc_4_10_h, acc_2_12_h, const_25_u16_); + + uint32x4x4_t acc = {{ + vaddl_u16(vget_low_u16(acc_2_4_10_12_l), + vget_low_u16(acc_0_1_3_11_13_14_l)), + vaddl_u16(vget_high_u16(acc_2_4_10_12_l), + vget_high_u16(acc_0_1_3_11_13_14_l)), + vaddl_u16(vget_low_u16(acc_2_4_10_12_h), + vget_low_u16(acc_0_1_3_11_13_14_h)), + vaddl_u16(vget_high_u16(acc_2_4_10_12_h), + vget_high_u16(acc_0_1_3_11_13_14_h)), + }}; + + acc.val[0] = + vmlal_u16(acc.val[0], vget_low_u16(acc_6_8_l), const_146_u16_half_); + acc.val[1] = + vmlal_u16(acc.val[1], vget_high_u16(acc_6_8_l), const_146_u16_half_); + acc.val[2] = + vmlal_u16(acc.val[2], vget_low_u16(acc_6_8_h), const_146_u16_half_); + acc.val[3] = + vmlal_u16(acc.val[3], vget_high_u16(acc_6_8_h), const_146_u16_half_); + + acc.val[0] = + vmlal_u16(acc.val[0], vget_low_u16(acc_5_9_l), const_118_u16_half_); + acc.val[1] = + vmlal_u16(acc.val[1], vget_high_u16(acc_5_9_l), const_118_u16_half_); + acc.val[2] = + vmlal_u16(acc.val[2], vget_low_u16(acc_5_9_h), const_118_u16_half_); + acc.val[3] = + vmlal_u16(acc.val[3], vget_high_u16(acc_5_9_h), const_118_u16_half_); + + acc.val[0] = + vmlal_u16(acc.val[0], vget_low_u16(acc_7_l), const_158_u16_half_); + acc.val[1] = + vmlal_u16(acc.val[1], vget_high_u16(acc_7_l), const_158_u16_half_); + acc.val[2] = + vmlal_u16(acc.val[2], vget_low_u16(acc_7_h), const_158_u16_half_); + acc.val[3] = + vmlal_u16(acc.val[3], vget_high_u16(acc_7_h), const_158_u16_half_); + + vst1q_u32_x4(&dst[0], acc); + } + + // Applies vertical filtering vector using scalar operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void vertical_scalar_path(const SourceType src[15], BufferType *dst) const { + uint32_t acc = (static_cast(src[3]) + src[11]) * 4; + acc += (acc + src[1] + src[13]) * 11; + acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + + (src[4] + src[10]) * 81; + acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; + dst[0] = acc; + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void horizontal_vector_path(uint32x4_t src[15], DestinationType *dst) const { + uint32x4_t acc_1_13 = vaddq_u32(src[1], src[13]); + uint32x4_t acc_2_12 = vaddq_u32(src[2], src[12]); + uint32x4_t acc_6_8 = vaddq_u32(src[6], src[8]); + uint32x4_t acc_5_9 = vaddq_u32(src[5], src[9]); + uint32x4_t acc_0_14 = vaddq_u32(src[0], src[14]); + uint32x4_t acc_3_11 = vaddq_u32(src[3], src[11]); + uint32x4_t acc_4_10 = vaddq_u32(src[4], src[10]); + + acc_0_14 = vshlq_n_u32(acc_0_14, 2); + acc_3_11 = vshlq_n_u32(acc_3_11, 2); + acc_4_10 = vmulq_u32(acc_4_10, const_81_u32_); + + uint32x4_t acc_1_3_11_13 = vaddq_u32(acc_3_11, acc_1_13); + acc_1_3_11_13 = vmlaq_u32(acc_3_11, acc_1_3_11_13, const_11_u32_); + uint32x4_t acc_0_1_3_11_13_14 = vaddq_u32(acc_1_3_11_13, acc_0_14); + uint32x4_t acc_2_4_10_12 = vmlaq_u32(acc_4_10, acc_2_12, const_25_u32_); + + uint32x4_t acc = vaddq_u32(acc_2_4_10_12, acc_0_1_3_11_13_14); + acc = vmlaq_u32(acc, acc_6_8, const_146_u32_); + acc = vmlaq_u32(acc, acc_5_9, const_118_u32_); + acc = vmlaq_u32(acc, src[7], const_158_u32_); + acc = vrshrq_n_u32(acc, 20); + + uint16x4_t narrowed = vmovn_u32(acc); + uint8x8_t interleaved = + vuzp1_u8(vreinterpret_u8_u16(narrowed), vreinterpret_u8_u16(narrowed)); + uint32_t result = vget_lane_u32(vreinterpret_u32_u8(interleaved), 0); + memcpy(&dst[0], &result, sizeof(result)); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void horizontal_scalar_path(const BufferType src[15], + DestinationType *dst) const { + uint32_t acc = (static_cast(src[3]) + src[11]) * 4; + acc += (acc + src[1] + src[13]) * 11; + acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + + (src[4] + src[10]) * 81; + acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; + dst[0] = rounding_shift_right(acc, 20); + } + + private: + uint16x8_t const_11_u16_; + uint32x4_t const_11_u32_; + uint16x8_t const_25_u16_; + uint32x4_t const_25_u32_; + uint16x8_t const_81_u16_; + uint32x4_t const_81_u32_; + uint16x4_t const_118_u16_half_; + uint32x4_t const_118_u32_; + uint16x4_t const_146_u16_half_; + uint32x4_t const_146_u32_; + uint16x4_t const_158_u16_half_; + uint32x4_t const_158_u32_; +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -312,7 +535,11 @@ kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, auto *workspace = reinterpret_cast(context); - if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { + if constexpr (KernelSize == 15) { + if (workspace->intermediate_size() != 4 * sizeof(ScalarType)) { + return KLEIDICV_ERROR_CONTEXT_MISMATCH; + } + } else if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } @@ -373,4 +600,16 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); +} + } // namespace kleidicv::neon diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp index 33a2dd09d..f463cf1be 100644 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sme2.cpp @@ -37,4 +37,11 @@ gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, border_type, context); } +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +gaussian_blur_15x15_u8(const uint8_t *, size_t, uint8_t *, size_t, size_t, + size_t, size_t, kleidicv_border_type_t, + kleidicv_filter_context_t *) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; +} + } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_sve2.cpp index 7ae808d33..76a87c3cf 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sve2.cpp @@ -43,4 +43,12 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *, size_t, uint8_t *, + size_t, size_t, size_t, size_t, + kleidicv_border_type_t, + kleidicv_filter_context_t *) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; +} + } // namespace kleidicv::sve2 diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index da9040dbb..700f64fb1 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -18,6 +18,7 @@ KLEIDICV_GAUSSIAN_BLUR(uint8_t, 3x3, u8); KLEIDICV_GAUSSIAN_BLUR(uint8_t, 5x5, u8); KLEIDICV_GAUSSIAN_BLUR(uint8_t, 7x7, u8); +KLEIDICV_GAUSSIAN_BLUR(uint8_t, 15x15, u8); // Implements KernelTestParams for Gaussian Blur operators. template @@ -26,7 +27,7 @@ struct GaussianBlurKernelTestParams; template struct GaussianBlurKernelTestParams { using InputType = uint8_t; - using IntermediateType = uint32_t; + using IntermediateType = uint64_t; using OutputType = uint8_t; static constexpr size_t kKernelSize = KernelSize; @@ -107,12 +108,18 @@ class GaussianBlurTest : public test::KernelTest { auto api = KernelTestParams::kKernelSize == 3 ? gaussian_blur_3x3() : KernelTestParams::kKernelSize == 5 ? gaussian_blur_5x5() - : gaussian_blur_7x7(); + : KernelTestParams::kKernelSize == 7 ? gaussian_blur_7x7() + : gaussian_blur_15x15(); // NOLINTEND(readability-avoid-nested-conditional-operator) + size_t intermediate_size = 2 * sizeof(InputType); + if constexpr (KernelTestParams::kKernelSize == 15) { + intermediate_size = 4 * sizeof(InputType); + } + kleidicv_filter_context_t *context = nullptr; auto ret = kleidicv_filter_create( - &context, input->channels(), 2 * sizeof(InputType), + &context, input->channels(), intermediate_size, kleidicv_rectangle_t{input->width() / input->channels(), input->height()}); if (ret != KLEIDICV_OK) { @@ -136,7 +143,8 @@ class GaussianBlurTest : public test::KernelTest { // NOLINTBEGIN(readability-avoid-nested-conditional-operator) return kernel.width() == 3 ? ((result + 8) / 16) : kernel.width() == 5 ? ((result + 128) / 256) - : ((result + 2048) / 4096); + : kernel.width() == 7 ? ((result + 2048) / 4096) + : ((result + 524288) / 1048576); // NOLINTEND(readability-avoid-nested-conditional-operator) } }; // end of class GaussianBlurTest @@ -217,6 +225,33 @@ TYPED_TEST(GaussianBlur, 7x7) { .test(mask); } +// Tests gaussian_blur_15x15_ API. +TYPED_TEST(GaussianBlur, 15x15) { + using KernelTestParams = GaussianBlurKernelTestParams; + // 15x15 GaussianBlur operator. + test::Array2D mask{15, 15}; + // clang-format off + mask.set(0, 0, { 16, 44, 100, 192, 324, 472, 584, 632, 584, 472, 324, 192, 100, 44, 16 }); + mask.set(1, 0, { 44, 121, 275, 528, 891, 1298, 1606, 1738, 1606, 1298, 891, 528, 275, 121, 44 }); + mask.set(2, 0, { 100, 275, 625, 1200, 2025, 2950, 3650, 3950, 3650, 2950, 2025, 1200, 625, 275, 100 }); + mask.set(3, 0, { 192, 528, 1200, 2304, 3888, 5664, 7008, 7584, 7008, 5664, 3888, 2304, 1200, 528, 192 }); + mask.set(4, 0, { 324, 891, 2025, 3888, 6561, 9558, 11826, 12798, 11826, 9558, 6561, 3888, 2025, 891, 324 }); + mask.set(5, 0, { 472, 1298, 2950, 5664, 9558, 13924, 17228, 18644, 17228, 13924, 9558, 5664, 2950, 1298, 472 }); + mask.set(6, 0, { 584, 1606, 3650, 7008, 11826, 17228, 21316, 23068, 21316, 17228, 11826, 7008, 3650, 1606, 584 }); + mask.set(7, 0, { 632, 1738, 3950, 7584, 12798, 18644, 23068, 24964, 23068, 18644, 12798, 7584, 3950, 1738, 632 }); + mask.set(8, 0, { 584, 1606, 3650, 7008, 11826, 17228, 21316, 23068, 21316, 17228, 11826, 7008, 3650, 1606, 584 }); + mask.set(9, 0, { 472, 1298, 2950, 5664, 9558, 13924, 17228, 18644, 17228, 13924, 9558, 5664, 2950, 1298, 472 }); + mask.set(10, 0, { 324, 891, 2025, 3888, 6561, 9558, 11826, 12798, 11826, 9558, 6561, 3888, 2025, 891, 324 }); + mask.set(11, 0, { 192, 528, 1200, 2304, 3888, 5664, 7008, 7584, 7008, 5664, 3888, 2304, 1200, 528, 192 }); + mask.set(12, 0, { 100, 275, 625, 1200, 2025, 2950, 3650, 3950, 3650, 2950, 2025, 1200, 625, 275, 100 }); + mask.set(13, 0, { 44, 121, 275, 528, 891, 1298, 1606, 1738, 1606, 1298, 891, 528, 275, 121, 44 }); + mask.set(14, 0, { 16, 44, 100, 192, 324, 472, 584, 632, 584, 472, 324, 192, 100, 44, 16 }); + // clang-format on + GaussianBlurTest{} + .with_border_types(make_generator_ptr(kAllBorders)) + .test(mask); +} + TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -280,10 +315,32 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, UnsupportedBorderType15x15) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + TypeParam src[1] = {}, dst[1]; + for (kleidicv_border_type_t border : { + KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_TRANSPARENT, + KLEIDICV_BORDER_TYPE_NONE, + }) { + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()(src, sizeof(TypeParam), dst, + sizeof(TypeParam), validSize, + validSize, 1, border, context)); + } + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, NullPointer) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -301,6 +358,10 @@ TYPED_TEST(GaussianBlur, NullPointer) { test::test_null_args(gaussian_blur_7x7(), src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context); + validSize = KernelTestParams15x15::kKernelSize - 1; + test::test_null_args(gaussian_blur_15x15(), src, sizeof(TypeParam), + dst, sizeof(TypeParam), validSize, validSize, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, context); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -312,6 +373,7 @@ TYPED_TEST(GaussianBlur, Misalignment) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -344,6 +406,15 @@ TYPED_TEST(GaussianBlur, Misalignment) { gaussian_blur_7x7()( src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + validSize = KernelTestParams15x15::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_15x15()( + src, sizeof(TypeParam) + 1, dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -413,6 +484,28 @@ TYPED_TEST(GaussianBlur, ZeroImageSize7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ZeroImageSize15x15) { + TypeParam src[1] = {}, dst[1]; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{0, 1})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 0, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{1, 0})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 1, 0, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, ValidImageSize3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -482,6 +575,39 @@ TYPED_TEST(GaussianBlur, ValidImageSize7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ValidImageSize15x15) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + test::Array2D src{validSize, validSize, + test::Options::vector_length()}; + src.set(0, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(1, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(2, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(3, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(4, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(5, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(6, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(7, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(8, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(9, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(10, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(11, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(12, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + src.set(13, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + + test::Array2D dst{validSize, validSize, + test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, + gaussian_blur_15x15()( + src.data(), src.stride(), dst.data(), dst.stride(), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REVERSE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, UndersizeImage3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -581,6 +707,39 @@ TYPED_TEST(GaussianBlur, UndersizeImage7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, UndersizeImage15x15) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t underSize = KernelTestParams::kKernelSize - 2; + size_t validWidth = KernelTestParams::kKernelSize + 10; + size_t validHeight = KernelTestParams::kKernelSize + 5; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, underSize})); + TypeParam src[1] = {}, dst[1]; + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, validHeight})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + validHeight, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{validWidth, underSize})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, OversizeImage) { kleidicv_filter_context_t *context = nullptr; ASSERT_EQ(KLEIDICV_OK, @@ -617,6 +776,16 @@ TYPED_TEST(GaussianBlur, OversizeImage) { src, sizeof(TypeParam), dst, sizeof(TypeParam), KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -624,6 +793,7 @@ TYPED_TEST(GaussianBlur, ChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -650,6 +820,13 @@ TYPED_TEST(GaussianBlur, ChannelNumber) { src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams15x15::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -657,6 +834,7 @@ TYPED_TEST(GaussianBlur, InvalidContextSizeType) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -678,6 +856,11 @@ TYPED_TEST(GaussianBlur, InvalidContextSizeType) { gaussian_blur_7x7()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + validSize = KernelTestParams15x15::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -685,6 +868,7 @@ TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -708,6 +892,12 @@ TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { gaussian_blur_7x7()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams15x15::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -715,6 +905,7 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; using KernelTestParams7x7 = GaussianBlurKernelTestParams; + using KernelTestParams15x15 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -738,6 +929,12 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { gaussian_blur_7x7()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams15x15::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_15x15()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, + validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } -- GitLab From c474bad326e1ca275e06a0e6cfb628e9f24a6e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 30 May 2024 18:34:22 +0200 Subject: [PATCH 3/8] Add implementation for SVE2/SME2 15x15 Gaussian blur --- .../kleidicv/separable_filter_15x15_sc.h | 266 ++++++++++++++++++ kleidicv/src/filters/gaussian_blur_sc.h | 173 +++++++++++- kleidicv/src/filters/gaussian_blur_sme2.cpp | 11 +- kleidicv/src/filters/gaussian_blur_sve2.cpp | 14 +- 4 files changed, 454 insertions(+), 10 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_sc.h diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h new file mode 100644 index 000000000..981953843 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h @@ -0,0 +1,266 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H +#define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_15x15.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 15x15 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{7UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c5())[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c6())[index]); + SourceVectorType src_7 = + svld1(pg, &src_rows.at(border_offsets.c7())[index]); + SourceVectorType src_8 = + svld1(pg, &src_rows.at(border_offsets.c8())[index]); + SourceVectorType src_9 = + svld1(pg, &src_rows.at(border_offsets.c9())[index]); + SourceVectorType src_10 = + svld1(pg, &src_rows.at(border_offsets.c10())[index]); + SourceVectorType src_11 = + svld1(pg, &src_rows.at(border_offsets.c11())[index]); + SourceVectorType src_12 = + svld1(pg, &src_rows.at(border_offsets.c12())[index]); + SourceVectorType src_13 = + svld1(pg, &src_rows.at(border_offsets.c13())[index]); + SourceVectorType src_14 = + svld1(pg, &src_rows.at(border_offsets.c14())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, src_7, src_8, src_9, src_10, src_11, + src_12, src_13, src_14, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + BufferVectorType src_0_7 = svld1(pg, &src_7[0]); + BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); + BufferVectorType src_0_8 = svld1(pg, &src_8[0]); + BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); + BufferVectorType src_0_9 = svld1(pg, &src_9[0]); + BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); + BufferVectorType src_0_10 = svld1(pg, &src_10[0]); + BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); + BufferVectorType src_0_11 = svld1(pg, &src_11[0]); + BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); + BufferVectorType src_0_12 = svld1(pg, &src_12[0]); + BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); + BufferVectorType src_0_13 = svld1(pg, &src_13[0]); + BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); + BufferVectorType src_0_14 = svld1(pg, &src_14[0]); + BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, src_0_7, src_0_8, + src_0_9, src_0_10, src_0_11, src_0_12, + src_0_13, src_0_14, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, + src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + BufferVectorType src_7 = + svld1(pg, &src_rows.at(0, border_offsets.c7())[index]); + BufferVectorType src_8 = + svld1(pg, &src_rows.at(0, border_offsets.c8())[index]); + BufferVectorType src_9 = + svld1(pg, &src_rows.at(0, border_offsets.c9())[index]); + BufferVectorType src_10 = + svld1(pg, &src_rows.at(0, border_offsets.c10())[index]); + BufferVectorType src_11 = + svld1(pg, &src_rows.at(0, border_offsets.c11())[index]); + BufferVectorType src_12 = + svld1(pg, &src_rows.at(0, border_offsets.c12())[index]); + BufferVectorType src_13 = + svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); + BufferVectorType src_14 = + svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, src_7, src_8, src_9, src_10, src_11, + src_12, src_13, src_14, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[15]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 15x15 separable filters driver type. +template +using SeparableFilter15x15 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_15X15_SC_H diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index a01b03d2e..96b8c5dc7 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,6 +8,7 @@ #include #include "kleidicv/kleidicv.h" +#include "kleidicv/separable_filter_15x15_sc.h" #include "kleidicv/separable_filter_3x3_sc.h" #include "kleidicv/separable_filter_5x5_sc.h" #include "kleidicv/separable_filter_7x7_sc.h" @@ -263,6 +264,172 @@ class DiscreteGaussianBlur { } }; // end of class DiscreteGaussianBlur +// Template for 15x15 Gaussian Blur approximation filters. +// +// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +// F = 1/1048576 * [ | | | | ... | | | | ] = +// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] +// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] +// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] +// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] +// +// [ 4 ] +// [ 11 ] +// [ 25 ] +// [ 48 ] +// [ 81 ] +// [ 118 ] +// [ 146 ] +// = 1/1048576 * [ 158 ] * [4,11,25,48,81,118,146,158,146,118,81,48,25,11,4] +// [ 146 ] +// [ 118 ] +// [ 81 ] +// [ 48 ] +// [ 25 ] +// [ 11 ] +// [ 4 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint32_t; + using DestinationType = uint8_t; + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void vertical_vector_path( + svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, + svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + svuint8_t src_7, svuint8_t src_8, svuint8_t src_9, svuint8_t src_10, + svuint8_t src_11, svuint8_t src_12, svuint8_t src_13, svuint8_t src_14, + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_7_b = svmovlb_u16(src_7); + svuint16_t acc_7_t = svmovlt_u16(src_7); + + svuint16_t acc_1_13_b = svaddlb_u16(src_1, src_13); + svuint16_t acc_1_13_t = svaddlt_u16(src_1, src_13); + + svuint16_t acc_2_12_b = svaddlb_u16(src_2, src_12); + svuint16_t acc_2_12_t = svaddlt_u16(src_2, src_12); + + svuint16_t acc_6_8_b = svaddlb_u16(src_6, src_8); + svuint16_t acc_6_8_t = svaddlt_u16(src_6, src_8); + + svuint16_t acc_5_9_b = svaddlb_u16(src_5, src_9); + svuint16_t acc_5_9_t = svaddlt_u16(src_5, src_9); + + svuint16_t acc_0_14_b = svaddlb_u16(src_0, src_14); + svuint16_t acc_0_14_t = svaddlt_u16(src_0, src_14); + + svuint16_t acc_3_11_b = svaddlb_u16(src_3, src_11); + svuint16_t acc_3_11_t = svaddlt_u16(src_3, src_11); + + svuint16_t acc_4_10_b = svaddlb_u16(src_4, src_10); + svuint16_t acc_4_10_t = svaddlt_u16(src_4, src_10); + + acc_0_14_b = svlsl_n_u16_x(pg, acc_0_14_b, 2); + acc_0_14_t = svlsl_n_u16_x(pg, acc_0_14_t, 2); + + acc_3_11_b = svlsl_n_u16_x(pg, acc_3_11_b, 2); + acc_3_11_t = svlsl_n_u16_x(pg, acc_3_11_t, 2); + + acc_4_10_b = svmul_n_u16_x(pg, acc_4_10_b, 81); + acc_4_10_t = svmul_n_u16_x(pg, acc_4_10_t, 81); + + svuint16_t acc_1_3_11_13_b = svadd_u16_x(pg, acc_3_11_b, acc_1_13_b); + svuint16_t acc_1_3_11_13_t = svadd_u16_x(pg, acc_3_11_t, acc_1_13_t); + acc_1_3_11_13_b = svmla_n_u16_x(pg, acc_3_11_b, acc_1_3_11_13_b, 11); + acc_1_3_11_13_t = svmla_n_u16_x(pg, acc_3_11_t, acc_1_3_11_13_t, 11); + + svuint16_t acc_0_1_3_11_13_14_b = + svadd_u16_x(pg, acc_1_3_11_13_b, acc_0_14_b); + svuint16_t acc_0_1_3_11_13_14_t = + svadd_u16_x(pg, acc_1_3_11_13_t, acc_0_14_t); + + svuint16_t acc_2_4_10_12_b = svmla_n_u16_x(pg, acc_4_10_b, acc_2_12_b, 25); + svuint16_t acc_2_4_10_12_t = svmla_n_u16_x(pg, acc_4_10_t, acc_2_12_t, 25); + + svuint32_t acc_b_b = svaddlb_u32(acc_2_4_10_12_b, acc_0_1_3_11_13_14_b); + svuint32_t acc_b_t = svaddlb_u32(acc_2_4_10_12_t, acc_0_1_3_11_13_14_t); + svuint32_t acc_t_b = svaddlt_u32(acc_2_4_10_12_b, acc_0_1_3_11_13_14_b); + svuint32_t acc_t_t = svaddlt_u32(acc_2_4_10_12_t, acc_0_1_3_11_13_14_t); + + acc_b_b = svmlalb_n_u32(acc_b_b, acc_6_8_b, 146); + acc_b_t = svmlalb_n_u32(acc_b_t, acc_6_8_t, 146); + acc_t_b = svmlalt_n_u32(acc_t_b, acc_6_8_b, 146); + acc_t_t = svmlalt_n_u32(acc_t_t, acc_6_8_t, 146); + + acc_b_b = svmlalb_n_u32(acc_b_b, acc_5_9_b, 118); + acc_b_t = svmlalb_n_u32(acc_b_t, acc_5_9_t, 118); + acc_t_b = svmlalt_n_u32(acc_t_b, acc_5_9_b, 118); + acc_t_t = svmlalt_n_u32(acc_t_t, acc_5_9_t, 118); + + acc_b_b = svmlalb_n_u32(acc_b_b, acc_7_b, 158); + acc_b_t = svmlalb_n_u32(acc_b_t, acc_7_t, 158); + acc_t_b = svmlalt_n_u32(acc_t_b, acc_7_b, 158); + acc_t_t = svmlalt_n_u32(acc_t_t, acc_7_t, 158); + + svuint32x4_t interleaved = + svcreate4_u32(acc_b_b, acc_b_t, acc_t_b, acc_t_t); + svst4_u32(pg, &dst[0], interleaved); + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void horizontal_vector_path( + svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, + svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, + svuint32_t src_7, svuint32_t src_8, svuint32_t src_9, svuint32_t src_10, + svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, + svuint32_t src_14, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); + svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); + svuint32_t acc_6_8 = svadd_u32_x(pg, src_6, src_8); + svuint32_t acc_5_9 = svadd_u32_x(pg, src_5, src_9); + svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); + svuint32_t acc_3_11 = svadd_u32_x(pg, src_3, src_11); + svuint32_t acc_4_10 = svadd_u32_x(pg, src_4, src_10); + + acc_0_14 = svlsl_n_u32_x(pg, acc_0_14, 2); + acc_3_11 = svlsl_n_u32_x(pg, acc_3_11, 2); + acc_4_10 = svmul_n_u32_x(pg, acc_4_10, 81); + + svuint32_t acc_1_3_11_13 = svadd_u32_x(pg, acc_3_11, acc_1_13); + acc_1_3_11_13 = svmla_n_u32_x(pg, acc_3_11, acc_1_3_11_13, 11); + svuint32_t acc_0_1_3_11_13_14 = svadd_u32_x(pg, acc_1_3_11_13, acc_0_14); + svuint32_t acc_2_4_10_12 = svmla_n_u32_x(pg, acc_4_10, acc_2_12, 25); + + svuint32_t acc = svadd_u32_x(pg, acc_2_4_10_12, acc_0_1_3_11_13_14); + acc = svmla_n_u32_x(pg, acc, acc_6_8, 146); + acc = svmla_n_u32_x(pg, acc, acc_5_9, 118); + acc = svmla_n_u32_x(pg, acc, src_7, 158); + acc = svrshr_n_u32_x(pg, acc, 20); + svst1b_u32(pg, &dst[0], acc); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * + // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T + void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) + const KLEIDICV_STREAMING_COMPATIBLE { + uint32_t acc = (static_cast(src[3]) + src[11]) * 4; + acc += (acc + src[1] + src[13]) * 11; + acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + + (src[4] + src[10]) * 81; + acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; + dst[0] = rounding_shift_right(acc, 20); + } +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur( const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -290,7 +457,11 @@ kleidicv_error_t discrete_gaussian_blur( auto *workspace = reinterpret_cast(context); - if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { + if constexpr (KernelSize == 15) { + if (workspace->intermediate_size() != 4 * sizeof(ScalarType)) { + return KLEIDICV_ERROR_CONTEXT_MISMATCH; + } + } else if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp index f463cf1be..ec9c6700e 100644 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sme2.cpp @@ -38,10 +38,13 @@ gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, } KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t -gaussian_blur_15x15_u8(const uint8_t *, size_t, uint8_t *, size_t, size_t, - size_t, size_t, kleidicv_border_type_t, - kleidicv_filter_context_t *) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; +gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t channels, kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); } } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_sve2.cpp index 76a87c3cf..1e872e2f4 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sve2.cpp @@ -44,11 +44,15 @@ kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, } KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *, size_t, uint8_t *, - size_t, size_t, size_t, size_t, - kleidicv_border_type_t, - kleidicv_filter_context_t *) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; +kleidicv_error_t gaussian_blur_15x15_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); } } // namespace kleidicv::sve2 -- GitLab From 67e9ca01ac9b954971864ca0d1f115a5eb00e2d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 29 May 2024 16:24:23 +0200 Subject: [PATCH 4/8] Add benchmarks for 15x15 Gaussian blur --- benchmark/benchmark.cpp | 19 +++++++++++++++---- scripts/benchmark/run_benchmarks_4K.sh | 2 ++ scripts/benchmark/run_benchmarks_FHD.sh | 2 ++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 9da671360..1af3daece 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -224,11 +224,12 @@ static void resize_linear_4x4_f32(benchmark::State& state) { } BENCHMARK(resize_linear_4x4_f32); -template +template static void gaussian_blur(Function f, benchmark::State& state) { kleidicv_filter_context_t* context; kleidicv_error_t err = - kleidicv_filter_create(&context, Channels, 2 * sizeof(T), + kleidicv_filter_create(&context, Channels, WideningMultiplier * sizeof(T), kleidicv_rectangle_t{image_width, image_height}); if (err != KLEIDICV_OK) { state.SkipWithError("Could not initialize Gaussian blur filter."); @@ -247,11 +248,21 @@ static void gaussian_blur(Function f, benchmark::State& state) { } static void gaussian_blur_7x7_u8_1ch(benchmark::State& state) { - gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); } BENCHMARK(gaussian_blur_7x7_u8_1ch); static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) { - gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); } BENCHMARK(gaussian_blur_7x7_u8_3ch); + +static void gaussian_blur_15x15_u8_1ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_15x15_u8, state); +} +BENCHMARK(gaussian_blur_15x15_u8_1ch); + +static void gaussian_blur_15x15_u8_3ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_15x15_u8, state); +} +BENCHMARK(gaussian_blur_15x15_u8_3ch); diff --git a/scripts/benchmark/run_benchmarks_4K.sh b/scripts/benchmark/run_benchmarks_4K.sh index 592147fc4..3582b7bfe 100755 --- a/scripts/benchmark/run_benchmarks_4K.sh +++ b/scripts/benchmark/run_benchmarks_4K.sh @@ -37,6 +37,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur3x3 opencv_perf_imgproc '*gaussianBlur3x3*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur5x5 opencv_perf_imgproc '*gaussianBlur5x5*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur7x7 opencv_perf_imgproc '*gaussianBlur7x7*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur15x15 opencv_perf_imgproc '*gaussianBlur15x15*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL Sobel_Gx opencv_perf_imgproc '*Border3x3_sobelFilter*' '(3840x2160, 16SC1, (1, 0), BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL Sobel_Gy opencv_perf_imgproc '*Border3x3_sobelFilter*' '(3840x2160, 16SC1, (0, 1), BORDER_REPLICATE)')") diff --git a/scripts/benchmark/run_benchmarks_FHD.sh b/scripts/benchmark/run_benchmarks_FHD.sh index bfc8430e0..533a17759 100755 --- a/scripts/benchmark/run_benchmarks_FHD.sh +++ b/scripts/benchmark/run_benchmarks_FHD.sh @@ -37,6 +37,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur3x3 opencv_perf_imgproc '*gaussianBlur3x3*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur5x5 opencv_perf_imgproc '*gaussianBlur5x5*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur7x7 opencv_perf_imgproc '*gaussianBlur7x7*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur15x15 opencv_perf_imgproc '*gaussianBlur15x15*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL Sobel_Gx opencv_perf_imgproc '*Border3x3_sobelFilter*' '(1920x1080, 16SC1, (1, 0), BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL Sobel_Gy opencv_perf_imgproc '*Border3x3_sobelFilter*' '(1920x1080, 16SC1, (0, 1), BORDER_REPLICATE)')") -- GitLab From e9c8c43e4ebab64ba04af9e93fcd71f4da4a22ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 30 May 2024 12:30:38 +0200 Subject: [PATCH 5/8] Update CHANGELOG.md, doc/functionality.md and doc/opencv.md --- CHANGELOG.md | 1 + doc/functionality.md | 12 ++++++------ doc/opencv.md | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c192adc87..0e83884eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ This changelog aims to follow the guiding principles of - Exponential function for float. - Bitwise and. - Gaussian Blur for 7x7 kernels. +- Gaussian Blur for 15x15 kernels. - Scale function for float. - Add, subtract, multiply & absdiff enabled in OpenCV HAL. - MinMax enabled in OpenCV HAL, float version added. diff --git a/doc/functionality.md b/doc/functionality.md index ed0cbc923..483ddb29c 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -64,12 +64,12 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Transpose | x | x | x | x | ## Image filters -| | u8 | -|-------------------------------|-----| -| Erode | x | -| Dilate | x | -| Sobel (3x3) | x | -| Gaussian Blur (3x3, 5x5, 7x7) | x | +| | u8 | +|--------------------------------------|-----| +| Erode | x | +| Dilate | x | +| Sobel (3x3) | x | +| Gaussian Blur (3x3, 5x5, 7x7, 15x15) | x | ## Resize to quarter | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index 8b2c8c83d..ea7b97936 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -109,7 +109,7 @@ Currently does not support non-zero margins. Kernel shape is restricted to squar Notes on parameters: * `depth` - only supports `CV_8U` depth. * `width`,`height` - Image width and height should be greater than or equal to the size of the kernel in the given direction. -* `ksize_width == ksize_height` - kernel size. Only 3x3, 5x5 and 7x7 kernels are supported. +* `ksize_width == ksize_height` - kernel size. Only 3x3, 5x5, 7x7 and 15x15 kernels are supported. * `border_type` - pixel extrapolation method. Supported [OpenCV border types](https://docs.opencv.org/5.x/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5) are: + `cv::BORDER_REPLICATE` -- GitLab From 1f9306b4e4eb803cb4ff31b2ee770cffcfecae03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Mon, 3 Jun 2024 14:07:04 +0200 Subject: [PATCH 6/8] Fix loop unrolling when processing borders This is intended to fix cases when the compiler is unable to unroll loops that are processing left and right borders. More specifically: - For Clang: move the constexpr margin variable definition inside the private process_horizontal method - For GCC: use size_t instead of the Margin object in separable filters --- .../kleidicv/separable_filter_15x15_neon.h | 2 +- .../kleidicv/separable_filter_15x15_sc.h | 4 +-- .../kleidicv/separable_filter_3x3_neon.h | 2 +- .../kleidicv/separable_filter_3x3_sc.h | 4 +-- .../kleidicv/separable_filter_5x5_neon.h | 2 +- .../kleidicv/separable_filter_5x5_sc.h | 4 +-- .../kleidicv/separable_filter_7x7_neon.h | 2 +- .../kleidicv/separable_filter_7x7_sc.h | 4 +-- .../include/kleidicv/workspace/separable.h | 26 ++++++++----------- 9 files changed, 19 insertions(+), 31 deletions(-) diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h index 425769eee..2475d1db3 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h @@ -32,7 +32,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) : filter_{filter} {} - static constexpr Margin margin() { return Margin{7UL}; } + static constexpr size_t margin = 7UL; void process_vertical(size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h index 981953843..f95067a09 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h @@ -36,9 +36,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{7UL}; - } + static constexpr size_t margin = 7UL; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h index ec03c40ea..3fecea047 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -32,7 +32,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) : filter_{filter} {} - static constexpr Margin margin() { return Margin{1UL}; } + static constexpr size_t margin = 1UL; void process_vertical(size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h index 42c178b02..6f624ae1c 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -36,9 +36,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{1UL}; - } + static constexpr size_t margin = 1UL; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h index 2694bc3bd..34f4290d7 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -32,7 +32,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) : filter_{filter} {} - static constexpr Margin margin() { return Margin{2UL}; } + static constexpr size_t margin = 2UL; void process_vertical(size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h index 54f8c1aa6..909e8ce18 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -36,9 +36,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{2UL}; - } + static constexpr size_t margin = 2UL; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h index 0b71b237a..4305d9d06 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -32,7 +32,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) : filter_{filter} {} - static constexpr Margin margin() { return Margin{3UL}; } + static constexpr size_t margin = 3UL; void process_vertical(size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h index 630340856..33f204a10 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -36,9 +36,7 @@ class SeparableFilter { explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{3UL}; - } + static constexpr size_t margin = 3UL; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index b8d036a07..6a501686e 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -138,9 +138,6 @@ class SeparableFilterWorkspace final { &data_[buffer_rows_offset_]), buffer_rows_stride_, channels}; - // Margin associated with the filter. - constexpr Margin margin = filter.margin(); - // Vertical processing loop. for (size_t vertical_index = 0; vertical_index < rect.height(); ++vertical_index) { @@ -151,7 +148,7 @@ class SeparableFilterWorkspace final { buffer_rows, offsets); // Process in the horizontal direction last. process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), - margin, filter, horizontal_border); + filter, horizontal_border); } } @@ -160,14 +157,15 @@ class SeparableFilterWorkspace final { void process_horizontal(size_t width, Rows buffer_rows, Rows dst_rows, - Margin margin, FilterType filter, + FilterType filter, typename FilterType::BorderInfoType horizontal_border) KLEIDICV_STREAMING_COMPATIBLE { + // Margin associated with the filter. + constexpr size_t margin = filter.margin; + // Process data affected by left border. -#ifdef __clang__ // GCC is unable to unroll the loop KLEIDICV_FORCE_LOOP_UNROLL -#endif - for (size_t horizontal_index = 0; horizontal_index < margin.left(); + for (size_t horizontal_index = 0; horizontal_index < margin; ++horizontal_index) { auto offsets = horizontal_border.offsets_with_left_border(horizontal_index); @@ -178,20 +176,18 @@ class SeparableFilterWorkspace final { // Process data which is not affected by any borders in bulk. { - size_t width_without_borders = width - margin.left() - margin.right(); + size_t width_without_borders = width - (2 * margin); auto offsets = horizontal_border.offsets_without_border(); filter.process_horizontal(width_without_borders, - buffer_rows.at(0, margin.left()), - dst_rows.at(0, margin.left()), offsets); + buffer_rows.at(0, margin), + dst_rows.at(0, margin), offsets); } // Process data affected by right border. -#ifdef __clang__ // GCC is unable to unroll the loop KLEIDICV_FORCE_LOOP_UNROLL -#endif - for (size_t horizontal_index = 0; horizontal_index < margin.right(); + for (size_t horizontal_index = 0; horizontal_index < margin; ++horizontal_index) { - size_t index = width - margin.right() + horizontal_index; + size_t index = width - margin + horizontal_index; auto offsets = horizontal_border.offsets_with_right_border(index); filter.process_horizontal_borders(buffer_rows.at(0, index), dst_rows.at(0, index), offsets); -- GitLab From 08f79304a1c47e72a33c057b37039537c560244c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Mon, 10 Jun 2024 13:19:43 +0200 Subject: [PATCH 7/8] Add entry to small_array_layouts in the test framework This ensures that the horizontal "unroll once" case in the 15x15 NEON separable filter is being accessed for code coverage purposes. --- test/api/test_gaussian_blur.cpp | 2 +- test/api/test_morphology.cpp | 2 +- test/framework/utils.cpp | 3 ++- test/framework/utils.h | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 700f64fb1..ea367d61d 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -95,7 +95,7 @@ class GaussianBlurTest : public test::KernelTest { } protected: - std::array small_array_layouts_; + std::array small_array_layouts_; std::unique_ptr> array_layout_generator_; std::unique_ptr> border_type_generator_; diff --git a/test/api/test_morphology.cpp b/test/api/test_morphology.cpp index c8035abd8..19f998ef8 100644 --- a/test/api/test_morphology.cpp +++ b/test/api/test_morphology.cpp @@ -125,7 +125,7 @@ class MorphologyTest test::Array2D mask_; test::Kernel kernel_; size_t iterations_; - std::array small_array_layouts_; + std::array small_array_layouts_; std::unique_ptr> array_layout_generator_; std::unique_ptr> border_type_generator_; diff --git a/test/framework/utils.cpp b/test/framework/utils.cpp index bd090d73a..2fa001a8d 100644 --- a/test/framework/utils.cpp +++ b/test/framework/utils.cpp @@ -59,7 +59,7 @@ std::array default_border_values() { }}; } -std::array small_array_layouts(size_t min_width, +std::array small_array_layouts(size_t min_width, size_t min_height) { size_t vl = test::Options::vector_length(); size_t width = std::max(min_width, vl); @@ -70,6 +70,7 @@ std::array small_array_layouts(size_t min_width, { min_width, min_height, 0, 1}, { min_width * 2, min_height, 0, 2}, { min_width * 3, min_height, vl, 3}, + { min_width * 3, min_height, 0, 1}, { width + 1, min_height, 0, 1}, { 2 * width, min_height, vl, 1}, { 4 * width, min_height, vl, 1}, diff --git a/test/framework/utils.h b/test/framework/utils.h index e5f0e46b0..a624db73c 100644 --- a/test/framework/utils.h +++ b/test/framework/utils.h @@ -116,7 +116,7 @@ void dump(const TwoDimensional *elements); std::array default_border_values(); // Returns an array of just a few small layouts. -std::array small_array_layouts(size_t min_width, +std::array small_array_layouts(size_t min_width, size_t min_height); // Returns an array of default tested layouts. std::array default_array_layouts(size_t min_width, -- GitLab From 86ae51884f80b359ff5c211947192cf1dd74fa59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 13 Jun 2024 14:43:52 +0200 Subject: [PATCH 8/8] Adjust intermediate size in 15x15 context mismatch tests This should fix the missing branch coverage in gaussian_blur_neon.cpp and gaussian_blur_sc.h. --- test/api/test_gaussian_blur.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index ea367d61d..ff9abc94e 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -856,6 +856,12 @@ TYPED_TEST(GaussianBlur, InvalidContextSizeType) { gaussian_blur_7x7()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam) + 1, + kleidicv_rectangle_t{validSize, validSize})); + validSize = KernelTestParams15x15::kKernelSize - 1; EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, gaussian_blur_15x15()( @@ -893,6 +899,11 @@ TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 2, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + validSize = KernelTestParams15x15::kKernelSize - 1; EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, gaussian_blur_15x15()( @@ -930,6 +941,11 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 4 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + validSize = KernelTestParams15x15::kKernelSize - 1; EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, gaussian_blur_15x15()( -- GitLab