From e0439cc39230a5078210868704ef0218479653c7 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 21 May 2025 12:32:36 +0000 Subject: [PATCH 1/2] Fix log text when skipping long-running MedianBlur tests --- test/api/test_median_blur.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/api/test_median_blur.cpp b/test/api/test_median_blur.cpp index 60469ab2a..1bb982aed 100644 --- a/test/api/test_median_blur.cpp +++ b/test/api/test_median_blur.cpp @@ -221,7 +221,9 @@ TYPED_TEST_SUITE(MedianBlurTest, ElementTypes); TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithoutPadding) { if (test::Options::are_long_running_tests_skipped()) { - GTEST_SKIP() << "Long running exp test skipped"; + GTEST_SKIP() << "Long running test " + "MedianBlurTest::RunAllParamCombinationsWithoutPadding " + "skipped"; } for (const auto& params : TestFixture::get_unpadded_test_cases()) { @@ -340,7 +342,10 @@ TYPED_TEST_SUITE(MedianBlurByteStrideTest, ByteStrideTypes); TYPED_TEST(MedianBlurByteStrideTest, RunAllParamCombinationsWithPadding) { if (test::Options::are_long_running_tests_skipped()) { - GTEST_SKIP() << "Long running exp test skipped"; + GTEST_SKIP() + << "Long running test " + "MedianBlurByteStrideTest::RunAllParamCombinationsWithPadding " + "skipped"; } for (const auto& params : TestFixture::get_padded_test_Cases()) { -- GitLab From dc2aa129b0939fcfe2646dae29a7f697b8a5e4b5 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 21 May 2025 12:32:56 +0000 Subject: [PATCH 2/2] Implement GaussianBlur 21x21 kernel --- CHANGELOG.md | 1 + benchmark/benchmark.cpp | 7 + conformity/opencv/test_gaussian_blur.cpp | 33 +- doc/functionality.md | 16 +- doc/opencv.md | 2 +- .../include/kleidicv/filters/gaussian_blur.h | 1 + .../filters/separable_filter_21x21_neon.h | 231 ++++++++++ .../filters/separable_filter_21x21_sc.h | 316 ++++++++++++++ kleidicv/include/kleidicv/kleidicv.h | 2 +- .../include/kleidicv/workspace/border_21x21.h | 411 ++++++++++++++++++ 
kleidicv/src/filters/gaussian_blur_neon.cpp | 7 + kleidicv/src/filters/gaussian_blur_sc.h | 201 ++++++++- scripts/benchmark/benchmarks.txt | 1 + test/api/test_gaussian_blur.cpp | 70 +++ 14 files changed, 1283 insertions(+), 16 deletions(-) create mode 100644 kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h create mode 100644 kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h create mode 100644 kleidicv/include/kleidicv/workspace/border_21x21.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dc6739e4..f92c7f228 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 5x5 kernels. +- Gaussian Blur for 21x21 kernels. ## 0.4.0 - 2025-03-25 diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 6d74b75b6..66fee6e5c 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -365,12 +365,19 @@ static void gaussian_blur(benchmark::State& state) { BENCH_GAUSSIAN_BLUR(3, 1); BENCH_GAUSSIAN_BLUR(3, 3); +BENCH_GAUSSIAN_BLUR(3, 4); BENCH_GAUSSIAN_BLUR(5, 1); BENCH_GAUSSIAN_BLUR(5, 3); +BENCH_GAUSSIAN_BLUR(5, 4); BENCH_GAUSSIAN_BLUR(7, 1); BENCH_GAUSSIAN_BLUR(7, 3); +BENCH_GAUSSIAN_BLUR(7, 4); BENCH_GAUSSIAN_BLUR(15, 1); BENCH_GAUSSIAN_BLUR(15, 3); +BENCH_GAUSSIAN_BLUR(15, 4); +BENCH_GAUSSIAN_BLUR(21, 1); +BENCH_GAUSSIAN_BLUR(21, 3); +BENCH_GAUSSIAN_BLUR(21, 4); template static void median_blur(benchmark::State& state, Function func) { diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index d40bddd67..d6eef1236 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -29,9 +29,9 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, size_t size_min = 5; size_t size_max = 16; - if constexpr (KernelSize == 15) { - size_min = 14; - size_max = 32; + if constexpr (KernelSize >= 15) { + size_min = KernelSize - 1; + size_max = 2 * 
KernelSize + 2; } for (size_t y = size_min; y <= size_max; ++y) { @@ -61,7 +61,7 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, // between the OpenCV and KleidiCV implementations that use - // the 15x15 kernel size, so we ignore any non-matching + // kernel sizes of 15x15 and above, so we ignore any non-matching // values that fall within the specified threshold. - if constexpr (KernelSize == 15) { + if constexpr (KernelSize >= 15) { threshold = 2; } @@ -178,6 +178,31 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 15x15, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 15x15, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 15x15, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian
blur 21x21, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), + + TEST("Gaussian blur 21x21, BORDER_REFLECT, 1 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT, 2 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT, 3 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 21x21, BORDER_REFLECT, 4 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), + + TEST("Gaussian blur 21x21, BORDER_WRAP, 1 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), + TEST("Gaussian blur 21x21, BORDER_WRAP, 2 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), + TEST("Gaussian blur 21x21, BORDER_WRAP, 3 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), + TEST("Gaussian blur 21x21, BORDER_WRAP, 4 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), + + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 3>), 
(exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 61926df93..81bb0f09e 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -72,14 +72,14 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Rotate (90 degrees clockwise) | x | x | x | x | ## Image filters -| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | -|--------------------------------------|-----|-----|-----|-----|-----|-----|-----| -| Erode | | x | | | | | | -| Dilate | | x | | | | | | -| Sobel (3x3) | | x | | | | | | -| Separable Filter 2D (5x5) | | x | x | x | | | | -| Gaussian Blur (3x3, 5x5, 7x7, 15x15) | | x | | | | | | -| Median Blur (5x5) | x | x | x | x | x | x | x | +| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | +|---------------------------------------------|-----|-----|-----|-----|-----|-----|-----| +| Erode | | x | | | | | | +| Dilate | | x | | | | | | +| Sobel (3x3) | | x | | | | | | +| Separable Filter 2D (5x5) | | x | x | x | | | | +| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | +| Median Blur (5x5) | x | x | x | x | x | x | x | ## Resize to quarter | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index f62512c11..113252fe6 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -108,7 +108,7 @@ In-place filtering is not supported i.e. `src` and `dst` must be different (non- Notes on parameters: * `src.depth()` - only supports `CV_8U` depth. * `src.cols`,`src.rows` - should be greater than or equal to the size of the kernel in the given direction. -* `ksize` - supported kernel sizes are 3x3, 5x5, 7x7 and 15x15. +* `ksize` - supported kernel sizes are 3x3, 5x5, 7x7, 15x15 and 21x21. * `sigmaX`, `sigmaY` - optimal performance is achieved if these are set to 0. 
* `borderType` - supported [OpenCV border types](https://docs.opencv.org/4.11.0/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5) are: + `cv::BORDER_REPLICATE` diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 3c1823d0f..68e5e1d3c 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -46,6 +46,7 @@ inline bool gaussian_blur_is_implemented(size_t width, size_t height, case 5: case 7: case 15: + case 21: break; default: return false; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h new file mode 100644 index 000000000..27edb8e0b --- /dev/null +++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_21x21.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 21x21 filter. 
+template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo21x21; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 10UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[21]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]); + src[15] = vld1q(&src_rows.at(border_offsets.c15())[index]); + src[16] = 
vld1q(&src_rows.at(border_offsets.c16())[index]); + src[17] = vld1q(&src_rows.at(border_offsets.c17())[index]); + src[18] = vld1q(&src_rows.at(border_offsets.c18())[index]); + src[19] = vld1q(&src_rows.at(border_offsets.c19())[index]); + src[20] = vld1q(&src_rows.at(border_offsets.c20())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + // No tail path needed in NEON, because TryToAvoidTailPath works for any + // supported size (i.e. the minimum size is kernel_size - 1, which is 20, + // and the NEON vector length is 16 which is smaller than that). + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + auto src_15 = &src_rows.at(0, border_offsets.c15())[index]; + auto src_16 = &src_rows.at(0, border_offsets.c16())[index]; + auto src_17 = &src_rows.at(0, border_offsets.c17())[index]; + auto src_18 = &src_rows.at(0, border_offsets.c18())[index]; + auto src_19 = 
&src_rows.at(0, border_offsets.c19())[index]; + auto src_20 = &src_rows.at(0, border_offsets.c20())[index]; + + BufferVectorType src_a[21], src_b[21]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + src_a[7] = vld1q(&src_7[0]); + src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]); + src_a[8] = vld1q(&src_8[0]); + src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]); + src_a[9] = vld1q(&src_9[0]); + src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]); + src_a[10] = vld1q(&src_10[0]); + src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]); + src_a[11] = vld1q(&src_11[0]); + src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]); + src_a[12] = vld1q(&src_12[0]); + src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]); + src_a[13] = vld1q(&src_13[0]); + src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]); + src_a[14] = vld1q(&src_14[0]); + src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]); + src_a[15] = vld1q(&src_15[0]); + src_b[15] = vld1q(&src_15[BufferVecTraits::num_lanes()]); + src_a[16] = vld1q(&src_16[0]); + src_b[16] = vld1q(&src_16[BufferVecTraits::num_lanes()]); + src_a[17] = vld1q(&src_17[0]); + src_b[17] = vld1q(&src_17[BufferVecTraits::num_lanes()]); + src_a[18] = vld1q(&src_18[0]); + src_b[18] = vld1q(&src_18[BufferVecTraits::num_lanes()]); + src_a[19] = vld1q(&src_19[0]); + src_b[19] = vld1q(&src_19[BufferVecTraits::num_lanes()]); + src_a[20] = 
vld1q(&src_20[0]); + src_b[20] = vld1q(&src_20[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[21]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]); + src[15] = vld1q(&src_rows.at(0, border_offsets.c15())[index]); + src[16] = vld1q(&src_rows.at(0, border_offsets.c16())[index]); + src[17] = vld1q(&src_rows.at(0, border_offsets.c17())[index]); + src[18] = vld1q(&src_rows.at(0, border_offsets.c18())[index]); + src[19] = vld1q(&src_rows.at(0, border_offsets.c19())[index]); + src[20] = vld1q(&src_rows.at(0, border_offsets.c20())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); 
++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[21]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; + src[15] = src_rows.at(0, border_offsets.c15())[index]; + src[16] = src_rows.at(0, border_offsets.c16())[index]; + src[17] = src_rows.at(0, border_offsets.c17())[index]; + src[18] = src_rows.at(0, border_offsets.c18())[index]; + src[19] = src_rows.at(0, border_offsets.c19())[index]; + src[20] = src_rows.at(0, border_offsets.c20())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 21x21 separable filters driver type. 
+template +using SeparableFilter21x21 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h new file mode 100644 index 000000000..a9ee82a9c --- /dev/null +++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h @@ -0,0 +1,316 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_21X21_SC_H +#define KLEIDICV_SEPARABLE_FILTER_21X21_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_21x21.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 21x21 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo21x21; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr size_t margin = 10UL; + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), 
SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. 
+ void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c5())[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c6())[index]); + SourceVectorType src_7 = + svld1(pg, &src_rows.at(border_offsets.c7())[index]); + SourceVectorType src_8 = + svld1(pg, &src_rows.at(border_offsets.c8())[index]); + SourceVectorType src_9 = + svld1(pg, &src_rows.at(border_offsets.c9())[index]); + SourceVectorType src_10 = + svld1(pg, &src_rows.at(border_offsets.c10())[index]); + SourceVectorType src_11 = + svld1(pg, &src_rows.at(border_offsets.c11())[index]); + SourceVectorType src_12 = + svld1(pg, &src_rows.at(border_offsets.c12())[index]); + SourceVectorType src_13 = + svld1(pg, &src_rows.at(border_offsets.c13())[index]); + SourceVectorType src_14 = + svld1(pg, &src_rows.at(border_offsets.c14())[index]); + SourceVectorType src_15 = + svld1(pg, &src_rows.at(border_offsets.c15())[index]); + SourceVectorType src_16 = + svld1(pg, &src_rows.at(border_offsets.c16())[index]); + SourceVectorType src_17 = + svld1(pg, &src_rows.at(border_offsets.c17())[index]); + SourceVectorType 
src_18 = + svld1(pg, &src_rows.at(border_offsets.c18())[index]); + SourceVectorType src_19 = + svld1(pg, &src_rows.at(border_offsets.c19())[index]); + SourceVectorType src_20 = + svld1(pg, &src_rows.at(border_offsets.c20())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, src_7, src_8, src_9, src_10, src_11, + src_12, src_13, src_14, src_15, src_16, src_17, + src_18, src_19, src_20, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + auto src_15 = &src_rows.at(0, border_offsets.c15())[index]; + auto src_16 = &src_rows.at(0, border_offsets.c16())[index]; + auto src_17 = &src_rows.at(0, border_offsets.c17())[index]; + auto src_18 = &src_rows.at(0, border_offsets.c18())[index]; + auto src_19 = &src_rows.at(0, border_offsets.c19())[index]; + auto src_20 = &src_rows.at(0, border_offsets.c20())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = 
svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + BufferVectorType src_0_7 = svld1(pg, &src_7[0]); + BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); + BufferVectorType src_0_8 = svld1(pg, &src_8[0]); + BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); + BufferVectorType src_0_9 = svld1(pg, &src_9[0]); + BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); + BufferVectorType src_0_10 = svld1(pg, &src_10[0]); + BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); + BufferVectorType src_0_11 = svld1(pg, &src_11[0]); + BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); + BufferVectorType src_0_12 = svld1(pg, &src_12[0]); + BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); + BufferVectorType src_0_13 = svld1(pg, &src_13[0]); + BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); + BufferVectorType src_0_14 = svld1(pg, &src_14[0]); + BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); + BufferVectorType src_0_15 = svld1(pg, &src_15[0]); + BufferVectorType src_1_15 = svld1_vnum(pg, &src_15[0], 1); + BufferVectorType src_0_16 = svld1(pg, &src_16[0]); + BufferVectorType src_1_16 = svld1_vnum(pg, &src_16[0], 1); + BufferVectorType src_0_17 = svld1(pg, &src_17[0]); + BufferVectorType src_1_17 = svld1_vnum(pg, &src_17[0], 1); + BufferVectorType src_0_18 = svld1(pg, &src_18[0]); + 
BufferVectorType src_1_18 = svld1_vnum(pg, &src_18[0], 1); + BufferVectorType src_0_19 = svld1(pg, &src_19[0]); + BufferVectorType src_1_19 = svld1_vnum(pg, &src_19[0], 1); + BufferVectorType src_0_20 = svld1(pg, &src_20[0]); + BufferVectorType src_1_20 = svld1_vnum(pg, &src_20[0], 1); + + filter_.horizontal_vector_path( + pg, src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6, + src_0_7, src_0_8, src_0_9, src_0_10, src_0_11, src_0_12, src_0_13, + src_0_14, src_0_15, src_0_16, src_0_17, src_0_18, src_0_19, src_0_20, + &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, + src_1_14, src_1_15, src_1_16, src_1_17, src_1_18, src_1_19, src_1_20, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + BufferVectorType src_7 = + svld1(pg, &src_rows.at(0, border_offsets.c7())[index]); + BufferVectorType src_8 = + svld1(pg, &src_rows.at(0, border_offsets.c8())[index]); + BufferVectorType src_9 = + svld1(pg, &src_rows.at(0, border_offsets.c9())[index]); + BufferVectorType src_10 = + svld1(pg, &src_rows.at(0, border_offsets.c10())[index]); + BufferVectorType src_11 = + svld1(pg, 
&src_rows.at(0, border_offsets.c11())[index]); + BufferVectorType src_12 = + svld1(pg, &src_rows.at(0, border_offsets.c12())[index]); + BufferVectorType src_13 = + svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); + BufferVectorType src_14 = + svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); + BufferVectorType src_15 = + svld1(pg, &src_rows.at(0, border_offsets.c15())[index]); + BufferVectorType src_16 = + svld1(pg, &src_rows.at(0, border_offsets.c16())[index]); + BufferVectorType src_17 = + svld1(pg, &src_rows.at(0, border_offsets.c17())[index]); + BufferVectorType src_18 = + svld1(pg, &src_rows.at(0, border_offsets.c18())[index]); + BufferVectorType src_19 = + svld1(pg, &src_rows.at(0, border_offsets.c19())[index]); + BufferVectorType src_20 = + svld1(pg, &src_rows.at(0, border_offsets.c20())[index]); + filter_.horizontal_vector_path( + pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, + src_9, src_10, src_11, src_12, src_13, src_14, src_15, src_16, src_17, + src_18, src_19, src_20, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[21]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, 
border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; + src[15] = src_rows.at(0, border_offsets.c15())[index]; + src[16] = src_rows.at(0, border_offsets.c16())[index]; + src[17] = src_rows.at(0, border_offsets.c17())[index]; + src[18] = src_rows.at(0, border_offsets.c18())[index]; + src[19] = src_rows.at(0, border_offsets.c19())[index]; + src[20] = src_rows.at(0, border_offsets.c20())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 21x21 separable filters driver type. +template +using SeparableFilter21x21 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_21X21_SC_H diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 9644ac3bb..e1aef97e4 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1212,7 +1212,7 @@ KLEIDICV_API_DECLARATION(kleidicv_canny_u8, const uint8_t *src, /// @param max_image_width Maximum image width. `max_image_width * /// max_image_height` must not be more than @ref /// KLEIDICV_MAX_IMAGE_PIXELS. -/// @param max_image_height Maximum image height. m`ax_image_width * +/// @param max_image_height Maximum image height. `max_image_width * /// max_image_height` must not be more than @ref /// KLEIDICV_MAX_IMAGE_PIXELS. /// diff --git a/kleidicv/include/kleidicv/workspace/border_21x21.h b/kleidicv/include/kleidicv/workspace/border_21x21.h new file mode 100644 index 000000000..df8d4cd13 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_21x21.h @@ -0,0 +1,411 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_21X21_H +#define KLEIDICV_WORKSPACE_BORDER_21X21_H + +#include "border_types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. 
+template +class FixedBorderInfo; + +// Border offsets for 21x21 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(ptrdiff_t o0, ptrdiff_t o1, ptrdiff_t o2, ptrdiff_t o3, + ptrdiff_t o4, ptrdiff_t o5, ptrdiff_t o6, ptrdiff_t o7, + ptrdiff_t o8, ptrdiff_t o9, ptrdiff_t o10, ptrdiff_t o11, + ptrdiff_t o12, ptrdiff_t o13, ptrdiff_t o14, ptrdiff_t o15, + ptrdiff_t o16, ptrdiff_t o17, ptrdiff_t o18, ptrdiff_t o19, + ptrdiff_t o20) + : offsets_{o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, + o11, o12, o13, o14, o15, o16, o17, o18, o19, o20} {} + + ptrdiff_t c0() const { return offsets_[0]; } + ptrdiff_t c1() const { return offsets_[1]; } + ptrdiff_t c2() const { return offsets_[2]; } + ptrdiff_t c3() const { return offsets_[3]; } + ptrdiff_t c4() const { return offsets_[4]; } + ptrdiff_t c5() const { return offsets_[5]; } + ptrdiff_t c6() const { return offsets_[6]; } + ptrdiff_t c7() const { return offsets_[7]; } + ptrdiff_t c8() const { return offsets_[8]; } + ptrdiff_t c9() const { return offsets_[9]; } + ptrdiff_t c10() const { return offsets_[10]; } + ptrdiff_t c11() const { return offsets_[11]; } + ptrdiff_t c12() const { return offsets_[12]; } + ptrdiff_t c13() const { return offsets_[13]; } + ptrdiff_t c14() const { return offsets_[14]; } + ptrdiff_t c15() const { return offsets_[15]; } + ptrdiff_t c16() const { return offsets_[16]; } + ptrdiff_t c17() const { return offsets_[17]; } + ptrdiff_t c18() const { return offsets_[18]; } + ptrdiff_t c19() const { return offsets_[19]; } + ptrdiff_t c20() const { return offsets_[20]; } + + private: + ptrdiff_t offsets_[21]; + }; + + FixedBorderInfo(size_t width, FixedBorderType border_type) + : width_(width), border_type_(border_type) {} + + // Returns offsets without the influence of any border. 
+ Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10); + } + + // NOLINTBEGIN(readability-function-cognitive-complexity) + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10); + } else if (column_index == 1) { + return get(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 2) { + return get(-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 3) { + return get(-3, -3, -3, -3, -3, -3, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 4) { + return get(-4, -4, -4, -4, -4, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 5) { + return get(-5, -5, -5, -5, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 6) { + return get(-6, -6, -6, -6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 7) { + return get(-7, -7, -7, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 8) { + return get(-8, -8, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else { + return get(-9, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10); + } else if (column_index == 1) { + return get(7, 6, 5, 4, 3, 2, 1, 0, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10); + } else if (column_index == 2) { + return get(5, 4, 3, 2, 1, 0, -1, -2, -2, -1, 0, 
1, 2, 3, 4, 5, 6, 7, + 8, 9, 10); + } else if (column_index == 3) { + return get(3, 2, 1, 0, -1, -2, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10); + } else if (column_index == 4) { + return get(1, 0, -1, -2, -3, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10); + } else if (column_index == 5) { + return get(-1, -2, -3, -4, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 6) { + return get(-3, -4, -5, -6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 7) { + return get(-5, -6, -7, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 8) { + return get(-7, -8, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else { + return get(-9, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, width_ - 5, width_ - 4, width_ - 3, width_ - 2, + width_ - 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 1) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, width_ - 5, width_ - 4, width_ - 3, width_ - 2, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 2) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, width_ - 5, width_ - 4, width_ - 3, -2, -1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 3) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, width_ - 5, width_ - 4, -3, -2, -1, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 4) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, width_ - 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 5) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, + width_ - 6, -5, -4, -3, -2, -1, 0, 
1, 2, 3, 4, 5, 6, 7, 8, + 9, 10); + } else if (column_index == 6) { + return get(width_ - 10, width_ - 9, width_ - 8, width_ - 7, -6, -5, + -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 7) { + return get(width_ - 10, width_ - 9, width_ - 8, -7, -6, -5, -4, -3, + -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else if (column_index == 8) { + return get(width_ - 10, width_ - 9, -8, -7, -6, -5, -4, -3, -2, -1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + } else { + return get(width_ - 10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, + 3, 4, 5, 6, 7, 8, 9, 10); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10); + } else if (column_index == 1) { + return get(8, 7, 6, 5, 4, 3, 2, 1, 0, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10); + } else if (column_index == 2) { + return get(6, 5, 4, 3, 2, 1, 0, -1, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10); + } else if (column_index == 3) { + return get(4, 3, 2, 1, 0, -1, -2, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10); + } else if (column_index == 4) { + return get(2, 1, 0, -1, -2, -3, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10); + } else if (column_index == 5) { + return get(0, -1, -2, -3, -4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10); + } else if (column_index == 6) { + return get(-2, -3, -4, -5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 7) { + return get(-4, -5, -6, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else if (column_index == 8) { + return get(-6, -7, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } else { + return get(-8, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. 
+ return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (width_ - 10)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 9); + } else if (column_index == (width_ - 9)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 8, 8); + } else if (column_index == (width_ - 8)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 7, 7, 7); + } else if (column_index == (width_ - 7)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 6, 6, 6, 6); + } else if (column_index == (width_ - 6)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 5, 5, 5, 5, 5); + } else if (column_index == (width_ - 5)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, + 4, 4, 4, 4, 4); + } else if (column_index == (width_ - 4)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, + 3, 3, 3, 3, 3); + } else if (column_index == (width_ - 3)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 2); + } else if (column_index == (width_ - 2)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1); + } else { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (width_ - 10)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 9); + } else if (column_index == (width_ - 9)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 8, 7); + } else if (column_index == (width_ - 8)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 
5, + 6, 7, 7, 6, 5); + } else if (column_index == (width_ - 7)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 6, 5, 4, 3); + } else if (column_index == (width_ - 6)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 5, 4, 3, 2, 1); + } else if (column_index == (width_ - 5)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, + 3, 2, 1, 0, -1); + } else if (column_index == (width_ - 4)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 2, + 1, 0, -1, -2, -3); + } else if (column_index == (width_ - 3)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 1, 0, + -1, -2, -3, -4, -5); + } else if (column_index == (width_ - 2)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 0, -1, + -2, -3, -4, -5, -6, -7); + } else { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 0, -1, -2, -3, + -4, -5, -6, -7, -8, -9); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (width_ - 10)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10 - width_); + } else if (column_index == (width_ - 9)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9 - width_, 10 - width_); + } else if (column_index == (width_ - 8)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8 - width_, 9 - width_, 10 - width_); + } else if (column_index == (width_ - 7)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7 - width_, 8 - width_, 9 - width_, 10 - width_); + } else if (column_index == (width_ - 6)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6 - width_, 7 - width_, 8 - width_, 9 - width_, + 10 - width_); + } else if (column_index == (width_ - 5)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, + 5 - width_, 6 - width_, 7 - width_, 8 - width_, 9 - width_, + 10 - 
width_); + } else if (column_index == (width_ - 4)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, + 4 - width_, 5 - width_, 6 - width_, 7 - width_, 8 - width_, + 9 - width_, 10 - width_); + } else if (column_index == (width_ - 3)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, + 3 - width_, 4 - width_, 5 - width_, 6 - width_, 7 - width_, + 8 - width_, 9 - width_, 10 - width_); + } else if (column_index == (width_ - 2)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2 - width_, + 3 - width_, 4 - width_, 5 - width_, 6 - width_, 7 - width_, + 8 - width_, 9 - width_, 10 - width_); + } else { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1 - width_, + 2 - width_, 3 - width_, 4 - width_, 5 - width_, 6 - width_, + 7 - width_, 8 - width_, 9 - width_, 10 - width_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (width_ - 10)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 8); + } else if (column_index == (width_ - 9)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 7, 6); + } else if (column_index == (width_ - 8)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 6, 5, 4); + } else if (column_index == (width_ - 7)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 5, 4, 3, 2); + } else if (column_index == (width_ - 6)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 4, 3, 2, 1, 0); + } else if (column_index == (width_ - 5)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 3, + 2, 1, 0, -1, -2); + } else if (column_index == (width_ - 4)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 2, 1, + 0, -1, -2, -3, -4); + } else if (column_index == (width_ - 3)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 1, 0, -1, + -2, -3, -4, -5, -6); + } else if (column_index == 
(width_ - 2)) { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 0, -1, -2, + -3, -4, -5, -6, -7, -8); + } else { + return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, -1, -2, -3, -4, + -5, -6, -7, -8, -9, -10); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + // NOLINTEND(readability-function-cognitive-complexity) + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index < 10U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (width_ - 10U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(ptrdiff_t o0, ptrdiff_t o1, ptrdiff_t o2, ptrdiff_t o3, + ptrdiff_t o4, ptrdiff_t o5, ptrdiff_t o6, ptrdiff_t o7, + ptrdiff_t o8, ptrdiff_t o9, ptrdiff_t o10, ptrdiff_t o11, + ptrdiff_t o12, ptrdiff_t o13, ptrdiff_t o14, ptrdiff_t o15, + ptrdiff_t o16, ptrdiff_t o17, ptrdiff_t o18, ptrdiff_t o19, + ptrdiff_t o20) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, + o11, o12, o13, o14, o15, o16, o17, o18, o19, o20}; + } + + size_t width_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 21x21 filter border type. 
+template +using FixedBorderInfo21x21 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_21X21_H diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index 5503b1eb8..003d8de8a 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -8,6 +8,7 @@ #include "kleidicv/ctypes.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/filters/separable_filter_15x15_neon.h" +#include "kleidicv/filters/separable_filter_21x21_neon.h" #include "kleidicv/filters/separable_filter_3x3_neon.h" #include "kleidicv/filters/separable_filter_5x5_neon.h" #include "kleidicv/filters/separable_filter_7x7_neon.h" @@ -676,6 +677,12 @@ static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, return gaussian_blur_fixed_kernel_size<15, IsBinomial>( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); + case 21: + // 21x21 does not have a binomial variant + return gaussian_blur_fixed_kernel_size<21, false>( + src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, + sigma, border_type, workspace); + // gaussian_blur_is_implemented checked the kernel size already. 
// GCOVR_EXCL_START default: diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 87dbb051b..2e65f0317 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -9,12 +9,12 @@ #include #include "kleidicv/filters/separable_filter_15x15_sc.h" +#include "kleidicv/filters/separable_filter_21x21_sc.h" #include "kleidicv/filters/separable_filter_3x3_sc.h" #include "kleidicv/filters/separable_filter_5x5_sc.h" #include "kleidicv/filters/separable_filter_7x7_sc.h" #include "kleidicv/filters/sigma.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" #include "kleidicv/workspace/separable.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -812,6 +812,198 @@ class GaussianBlur final } }; // end of class GaussianBlur +template <> +class GaussianBlur final + : public GaussianBlurNonBinomialBase { + public: + using SourceType = uint8_t; + using BufferType = uint32_t; + using DestinationType = uint8_t; + + explicit GaussianBlur(float sigma) KLEIDICV_STREAMING_COMPATIBLE + : GaussianBlurNonBinomialBase(sigma) {} + + void vertical_vector_path( + svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, + svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + svuint8_t src_7, svuint8_t src_8, svuint8_t src_9, svuint8_t src_10, + svuint8_t src_11, svuint8_t src_12, svuint8_t src_13, svuint8_t src_14, + svuint8_t src_15, svuint8_t src_16, svuint8_t src_17, svuint8_t src_18, + svuint8_t src_19, svuint8_t src_20, + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg16all = svptrue_b16(); + + // (10) + (9 + 11) + // Need to calculate them in 32 bits, for small sigmas they can be large + svuint16_t acc_10_0 = svmovlb_u16(src_10); + svuint16_t acc_10_1 = svmovlt_u16(src_10); + + svuint32_t acc3_00 = svmullb_n_u32(acc_10_0, half_kernel_[10]); + svuint32_t acc3_10 = svmullt_n_u32(acc_10_0, half_kernel_[10]); + svuint32_t acc3_01 = svmullb_n_u32(acc_10_1, 
half_kernel_[10]); + svuint32_t acc3_11 = svmullt_n_u32(acc_10_1, half_kernel_[10]); + + svuint16_t acc_9_0 = svaddlb_u16(src_9, src_11); + svuint16_t acc_9_1 = svaddlt_u16(src_9, src_11); + + acc3_00 = svmlalb_n_u32(acc3_00, acc_9_0, half_kernel_[9]); + acc3_10 = svmlalt_n_u32(acc3_10, acc_9_0, half_kernel_[9]); + acc3_01 = svmlalb_n_u32(acc3_01, acc_9_1, half_kernel_[9]); + acc3_11 = svmlalt_n_u32(acc3_11, acc_9_1, half_kernel_[9]); + + // (8 + 12) + (7 + 13) + (6 + 14) + // 16bits are enough for these products, for any sigma + svuint16_t acc_8_0 = svaddlb_u16(src_8, src_12); + svuint16_t acc_8_1 = svaddlt_u16(src_8, src_12); + + svuint16_t mul8_0 = svmul_n_u16_x(pg16all, acc_8_0, half_kernel_[8]); + svuint16_t mul8_1 = svmul_n_u16_x(pg16all, acc_8_1, half_kernel_[8]); + + svuint16_t acc_7_0 = svaddlb_u16(src_7, src_13); + svuint16_t acc_7_1 = svaddlt_u16(src_7, src_13); + + svuint16_t mul7_0 = svmul_n_u16_x(pg16all, acc_7_0, half_kernel_[7]); + svuint16_t mul7_1 = svmul_n_u16_x(pg16all, acc_7_1, half_kernel_[7]); + + svuint16_t acc_6_0 = svaddlb_u16(src_6, src_14); + svuint16_t acc_6_1 = svaddlt_u16(src_6, src_14); + + svuint16_t mul6_0 = svmul_n_u16_x(pg16all, acc_6_0, half_kernel_[6]); + svuint16_t mul6_1 = svmul_n_u16_x(pg16all, acc_6_1, half_kernel_[6]); + + svuint32_t acc2_00 = svaddlb_u32(mul6_0, mul7_0); + svuint32_t acc2_10 = svaddlt_u32(mul6_0, mul7_0); + svuint32_t acc2_01 = svaddlb_u32(mul6_1, mul7_1); + svuint32_t acc2_11 = svaddlt_u32(mul6_1, mul7_1); + + svbool_t pg32all = svptrue_b32(); + acc2_00 = svadd_u32_x(pg32all, acc2_00, svmovlb_u32(mul8_0)); + acc2_10 = svadd_u32_x(pg32all, acc2_10, svmovlt_u32(mul8_0)); + acc2_01 = svadd_u32_x(pg32all, acc2_01, svmovlb_u32(mul8_1)); + acc2_11 = svadd_u32_x(pg32all, acc2_11, svmovlt_u32(mul8_1)); + + // (5 + 15) + (4 + 16) + (3 + 17) + // these fit into 16 bits together with acc0 too, we can save some cycles + svuint16_t acc_5_0 = svaddlb_u16(src_5, src_15); + svuint16_t acc_5_1 = svaddlt_u16(src_5, 
src_15); + + svuint16_t acc1_0 = svmul_n_u16_x(pg16all, acc_5_0, half_kernel_[5]); + svuint16_t acc1_1 = svmul_n_u16_x(pg16all, acc_5_1, half_kernel_[5]); + + svuint16_t acc_4_0 = svaddlb_u16(src_4, src_16); + svuint16_t acc_4_1 = svaddlt_u16(src_4, src_16); + + acc1_0 = svmla_n_u16_x(pg16all, acc1_0, acc_4_0, half_kernel_[4]); + acc1_1 = svmla_n_u16_x(pg16all, acc1_1, acc_4_1, half_kernel_[4]); + + svuint16_t acc_3_0 = svaddlb_u16(src_3, src_17); + svuint16_t acc_3_1 = svaddlt_u16(src_3, src_17); + + acc1_0 = svmla_n_u16_x(pg16all, acc1_0, acc_3_0, half_kernel_[3]); + acc1_1 = svmla_n_u16_x(pg16all, acc1_1, acc_3_1, half_kernel_[3]); + + // (2 + 18) + (1 + 19) + (0 + 20) + // these fit into 16 bits together with acc1 too, we can save some cycles + svuint16_t acc_2_0 = svaddlb_u16(src_2, src_18); + svuint16_t acc_2_1 = svaddlt_u16(src_2, src_18); + + svuint16_t acc0_0 = svmul_n_u16_x(pg16all, acc_2_0, half_kernel_[2]); + svuint16_t acc0_1 = svmul_n_u16_x(pg16all, acc_2_1, half_kernel_[2]); + + svuint16_t acc_1_0 = svaddlb_u16(src_1, src_19); + svuint16_t acc_1_1 = svaddlt_u16(src_1, src_19); + + acc0_0 = svmla_n_u16_x(pg16all, acc0_0, acc_1_0, half_kernel_[1]); + acc0_1 = svmla_n_u16_x(pg16all, acc0_1, acc_1_1, half_kernel_[1]); + + svuint16_t acc_0_0 = svaddlb_u16(src_0, src_20); + svuint16_t acc_0_1 = svaddlt_u16(src_0, src_20); + + acc0_0 = svmla_n_u16_x(pg16all, acc0_0, acc_0_0, half_kernel_[0]); + acc0_1 = svmla_n_u16_x(pg16all, acc0_1, acc_0_1, half_kernel_[0]); + + // Sum them up + svuint32_t acc_second_00 = svadd_u32_x(pg32all, acc3_00, acc2_00); + svuint32_t acc_second_10 = svadd_u32_x(pg32all, acc3_10, acc2_10); + svuint32_t acc_second_01 = svadd_u32_x(pg32all, acc3_01, acc2_01); + svuint32_t acc_second_11 = svadd_u32_x(pg32all, acc3_11, acc2_11); + + svuint32_t acc_first_00 = svaddlb_u32(acc1_0, acc0_0); + svuint32_t acc_first_10 = svaddlt_u32(acc1_0, acc0_0); + svuint32_t acc_first_01 = svaddlb_u32(acc1_1, acc0_1); + svuint32_t acc_first_11 = 
svaddlt_u32(acc1_1, acc0_1); + + svuint32_t acc_00 = svadd_u32_x(pg32all, acc_first_00, acc_second_00); + svuint32_t acc_10 = svadd_u32_x(pg32all, acc_first_10, acc_second_10); + svuint32_t acc_01 = svadd_u32_x(pg32all, acc_first_01, acc_second_01); + svuint32_t acc_11 = svadd_u32_x(pg32all, acc_first_11, acc_second_11); + + svuint32x4_t interleaved = svcreate4(acc_00, acc_01, acc_10, acc_11); + svst4(pg, &dst[0], interleaved); + } + + void horizontal_vector_path( + svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, + svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, + svuint32_t src_7, svuint32_t src_8, svuint32_t src_9, svuint32_t src_10, + svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, + svuint32_t src_14, svuint32_t src_15, svuint32_t src_16, + svuint32_t src_17, svuint32_t src_18, svuint32_t src_19, + svuint32_t src_20, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint32_t acc = svmul_n_u32_x(pg, src_10, half_kernel_[10]); + + svuint32_t acc_9_11 = svadd_u32_x(pg, src_9, src_11); + acc = svmla_n_u32_x(pg, acc, acc_9_11, half_kernel_[9]); + + svuint32_t acc_8_12 = svadd_u32_x(pg, src_8, src_12); + acc = svmla_n_u32_x(pg, acc, acc_8_12, half_kernel_[8]); + + svuint32_t acc_7_13 = svadd_u32_x(pg, src_7, src_13); + acc = svmla_n_u32_x(pg, acc, acc_7_13, half_kernel_[7]); + + svuint32_t acc_6_14 = svadd_u32_x(pg, src_6, src_14); + acc = svmla_n_u32_x(pg, acc, acc_6_14, half_kernel_[6]); + + svuint32_t acc_5_15 = svadd_u32_x(pg, src_5, src_15); + acc = svmla_n_u32_x(pg, acc, acc_5_15, half_kernel_[5]); + + svuint32_t acc_4_16 = svadd_u32_x(pg, src_4, src_16); + acc = svmla_n_u32_x(pg, acc, acc_4_16, half_kernel_[4]); + + svuint32_t acc_3_17 = svadd_u32_x(pg, src_3, src_17); + acc = svmla_n_u32_x(pg, acc, acc_3_17, half_kernel_[3]); + + svuint32_t acc_2_18 = svadd_u32_x(pg, src_2, src_18); + acc = svmla_n_u32_x(pg, acc, acc_2_18, half_kernel_[2]); + + svuint32_t acc_1_19 = svadd_u32_x(pg, src_1, 
src_19); + acc = svmla_n_u32_x(pg, acc, acc_1_19, half_kernel_[1]); + + svuint32_t acc_0_20 = svadd_u32_x(pg, src_0, src_20); + acc = svmla_n_u32_x(pg, acc, acc_0_20, half_kernel_[0]); + + acc = svrshr_n_u32_x(pg, acc, 16); + svst1b_u32(pg, &dst[0], acc); + } + + void horizontal_scalar_path(const BufferType src[21], DestinationType *dst) + const KLEIDICV_STREAMING_COMPATIBLE { + uint32_t acc = (src[0] + src[20]) * half_kernel_[0] + + (src[1] + src[19]) * half_kernel_[1] + + (src[2] + src[18]) * half_kernel_[2] + + (src[3] + src[17]) * half_kernel_[3] + + (src[4] + src[16]) * half_kernel_[4] + + (src[5] + src[15]) * half_kernel_[5] + + (src[6] + src[14]) * half_kernel_[6] + + (src[7] + src[13]) * half_kernel_[7] + + (src[8] + src[12]) * half_kernel_[8] + + (src[9] + src[11]) * half_kernel_[9] + + src[10] * half_kernel_[10]; + dst[0] = static_cast(rounding_shift_right(acc, 16)); + } +}; // end of class GaussianBlur + template static kleidicv_error_t gaussian_blur_fixed_kernel_size( const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -854,7 +1046,12 @@ static kleidicv_error_t gaussian_blur( return gaussian_blur_fixed_kernel_size<15, IsBinomial>( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); - // gaussian_blur_is_implemented checked the kernel size already. + case 21: + // 21x21 does not have a binomial variant + return gaussian_blur_fixed_kernel_size<21, false>( + src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, + sigma, border_type, workspace); + // gaussian_blur_is_implemented checked the kernel size already. 
// GCOVR_EXCL_START default: assert(!"kernel size not implemented"); diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 128529bb8..b218b5b3d 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -42,6 +42,7 @@ GaussianBlur3x3_CustomSigma: opencv_perf_imgproc '*gaussianBlur3x3_CustomSigma GaussianBlur5x5_CustomSigma: opencv_perf_imgproc '*gaussianBlur5x5_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur7x7_CustomSigma: opencv_perf_imgproc '*gaussianBlur7x7_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur15x15_CustomSigma: opencv_perf_imgproc '*gaussianBlur15x15_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' +GaussianBlur21x21_CustomSigma: opencv_perf_imgproc '*gaussianBlur21x21_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' Sobel_Gx: opencv_perf_imgproc '*Border3x3_sobelFilter.sobelFilter/*' '($PIXEL_FORMAT, 16SC1, (1, 0), BORDER_REPLICATE)' Sobel_Gy: opencv_perf_imgproc '*Border3x3_sobelFilter.sobelFilter/*' '($PIXEL_FORMAT, 16SC1, (0, 1), BORDER_REPLICATE)' diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 4656bdbc4..2a8c28c17 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -114,6 +114,9 @@ class GaussianBlurTest : public test::KernelTest { if constexpr (KernelTestParams::kKernelSize == 15) { return (result + 524288) / 1048576; } + if constexpr (KernelTestParams::kKernelSize == 21) { + return (result + 32768) / 65536; + } } const ArrayContainerType array_layouts_; @@ -215,6 +218,53 @@ TYPED_TEST(GaussianBlur, 15x15) { GaussianBlurTest{KernelTestParams{}}.test(mask); } +// Tests gaussian_blur_21x21_ API. 
+TYPED_TEST(GaussianBlur, 21x21) { + using KernelTestParams = GaussianBlurKernelTestParams; + test::Array2D mask{21, 21}; + // clang-format off + mask.set(0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + mask.set(1, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); + mask.set(2, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); + mask.set(3, 0, {0, 8, 8, 16, 24, 44, 60, 80, 100, 112, 120, 112, 100, 80, 60, 44, 24, 16, 8, 8, 0}); + mask.set(4, 0, {0, 12, 12, 24, 36, 66, 90, 120, 150, 168, 180, 168, 150, 120, 90, 66, 36, 24, 12, 12, 0}); + mask.set(5, 0, {0, 22, 22, 44, 66, 121, 165, 220, 275, 308, 330, 308, 275, 220, 165, 121, 66, 44, 22, 22, 0}); + mask.set(6, 0, {0, 30, 30, 60, 90, 165, 225, 300, 375, 420, 450, 420, 375, 300, 225, 165, 90, 60, 30, 30, 0}); + mask.set(7, 0, {0, 40, 40, 80, 120, 220, 300, 400, 500, 560, 600, 560, 500, 400, 300, 220, 120, 80, 40, 40, 0}); + mask.set(8, 0, {0, 50, 50, 100, 150, 275, 375, 500, 625, 700, 750, 700, 625, 500, 375, 275, 150, 100, 50, 50, 0}); + mask.set(9, 0, {0, 56, 56, 112, 168, 308, 420, 560, 700, 784, 840, 784, 700, 560, 420, 308, 168, 112, 56, 56, 0}); + mask.set(10, 0, {0, 60, 60, 120, 180, 330, 450, 600, 750, 840, 900, 840, 750, 600, 450, 330, 180, 120, 60, 60, 0}); + mask.set(11, 0, {0, 56, 56, 112, 168, 308, 420, 560, 700, 784, 840, 784, 700, 560, 420, 308, 168, 112, 56, 56, 0}); + mask.set(12, 0, {0, 50, 50, 100, 150, 275, 375, 500, 625, 700, 750, 700, 625, 500, 375, 275, 150, 100, 50, 50, 0}); + mask.set(13, 0, {0, 40, 40, 80, 120, 220, 300, 400, 500, 560, 600, 560, 500, 400, 300, 220, 120, 80, 40, 40, 0}); + mask.set(14, 0, {0, 30, 30, 60, 90, 165, 225, 300, 375, 420, 450, 420, 375, 300, 225, 165, 90, 60, 30, 30, 0}); + mask.set(15, 0, {0, 22, 22, 44, 66, 121, 165, 220, 275, 308, 330, 308, 275, 220, 165, 121, 66, 44, 22, 22, 0}); + mask.set(16, 0, {0, 12, 12, 24, 36, 66, 90, 120, 150, 168, 180, 168, 150, 120, 90, 66, 36,
24, 12, 12, 0}); + mask.set(17, 0, {0, 8, 8, 16, 24, 44, 60, 80, 100, 112, 120, 112, 100, 80, 60, 44, 24, 16, 8, 8, 0}); + mask.set(18, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); + mask.set(19, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); + mask.set(20, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + + // clang-format on + auto array_layouts = [](size_t w, size_t h) { + size_t vl = test::Options::vector_length(); + size_t margin = w / 2; + // two borders + one for the tail, so the NEON scalar path activates + size_t small_width = 2 * margin + 1; + // two borders + unroll-once (vl / 4) + one for the tail + size_t medium_width = 2 * margin + vl / 4 + 1; + // two borders + unroll-twice (2 * vl / 4) + one for the tail + size_t big_width = 2 * margin + 2 * vl / 4 + 1; + return std::array{{ + {small_width, 2 * margin + 1, 1, 1}, + {medium_width, h, 1, 1}, + {big_width, h, 1, 1}, + }}; + }; + + GaussianBlurTest{KernelTestParams{}, array_layouts}.test(mask); +} + TYPED_TEST(GaussianBlur, 3x3_CustomSigma) { kleidicv_filter_context_t *context = nullptr; ASSERT_EQ(KLEIDICV_OK, @@ -515,6 +565,26 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType15x15) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } +TYPED_TEST(GaussianBlur, UnsupportedBorderType21x21) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 21, 21, + validSize, validSize)); + TypeParam src[1] = {}, dst[1]; + for (kleidicv_border_type_t border : { + KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_TRANSPARENT, + KLEIDICV_BORDER_TYPE_NONE, + }) { + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, 21, 21, 0.0, 0.0, border, context)); + }
+ EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); +} + TYPED_TEST(GaussianBlur, DifferentKernelSize) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; -- GitLab