diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c04ee50f3b7e61b5314a019ad25db1d265a2e91..240a62e8c6ec9aa302a8efbef6f2f6f7052eaa5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ This changelog aims to follow the guiding principles of ### Added - Exponential function for float. +- Gaussian Blur for 7x7 kernels. + ### Fixed ### Changed diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 380931a3cc3d334049c266735c62ee3006c343e0..67b769b19bf1975e8f57e8da8e19c8da62a5a38a 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -268,6 +268,8 @@ int gaussian_blur_binomial(const uchar *src_data, size_t src_step, impl = kleidicv_gaussian_blur_3x3_u8; } else if ((kernel_size == 5) && (width >= 5) && (height >= 5)) { impl = kleidicv_gaussian_blur_5x5_u8; + } else if ((kernel_size == 7) && (width >= 7) && (height >= 7)) { + impl = kleidicv_gaussian_blur_7x7_u8; } else { return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index b66e4d7be92eb4798798674e2259fbe8ae501937..8da44664989161c1409d0694ac2d32cc64c393ef 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -131,3 +131,44 @@ static void resize_linear_4x4_f32(benchmark::State& state) { resize_linear(kleidicv_resize_linear_f32, 4, 4, state); } BENCHMARK(resize_linear_4x4_f32); + +template +static void gaussian_blur(Function f, size_t channels, + benchmark::State& state) { + // Setup + std::vector src, dst; + src.resize(image_width * image_height * channels); + dst.resize(image_width * image_height * channels); + + std::mt19937 generator; + std::generate(src.begin(), src.end(), generator); + + kleidicv_filter_context_t* context; + kleidicv_error_t err = + kleidicv_filter_create(&context, channels, 2 * sizeof(T), + kleidicv_rectangle_t{image_width, image_height}); + if (err != KLEIDICV_OK) { + state.SkipWithError("Could not initialize Gaussian blur filter."); + return; + } + + for (auto _ : state) { + // This code gets benchmarked + auto unused = + f(src.data(), image_width, dst.data(), image_width, image_width, + image_height, channels, KLEIDICV_BORDER_TYPE_REFLECT, context); + (void)unused; + } + + (void)kleidicv_filter_release(context); +} + +static void gaussian_blur_7x7_u8_1ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 1, state); +} +BENCHMARK(gaussian_blur_7x7_u8_1ch); + +static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 3, state); +} +BENCHMARK(gaussian_blur_7x7_u8_3ch); diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index 87dd565dc9342224c33d44d309fd5a5dbf0ed25f..f56f4a9333458ab02faa09d9006def5d096dd622 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -82,6 +82,26 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 5x5, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 5x5, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 5x5, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + + TEST("Gaussian blur 7x7, BORDER_REFLECT, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 2 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + + TEST("Gaussian blur 7x7, BORDER_WRAP, 1 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 2 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 3 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 4 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 0edc0259ad718c8f5d1bb6b4fa4945631b56f20d..874f5442db0e4c9feec957ec1b6bf486bb8bb5e0 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -57,12 +57,12 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Transpose | x | x | x | x | ## Image filters -| | u8 | -|-----------------------|-----| -| Erode | x | -| Dilate | x | -| Sobel | x | -| Gaussian Blur | x | +| | u8 | +|-------------------------------|-----| +| Erode | x | +| Dilate | x | +| Sobel (3x3) | x | +| Gaussian Blur (3x3, 5x5, 7x7) | x | ## Resize with linear interpolation | | u8 | f32 | diff --git a/doc/opencv.md b/doc/opencv.md index 99e287c566a611e793b08bf4ef240a0ea2cc8ea9..aa687ec16f5c97b6a2e242a03125c663726d281f 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -89,7 +89,7 @@ Currently does not support non-zero margins. Kernel shape is restricted to squar Notes on parameters: * `depth` - only supports `CV_8U` depth. * `width`,`height` - Image width and height should be greater than or equal to the size of the kernel in the given direction. -* `ksize_width == ksize_height` - kernel size. Only 3x3 and 5x5 kernels are supported. +* `ksize_width == ksize_height` - kernel size. Only 3x3, 5x5 and 7x7 kernels are supported. * `border_type` - pixel extrapolation method. Supported [OpenCV border types](https://docs.opencv.org/5.x/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5) are: + `cv::BORDER_REPLICATE` diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 9d460027ad15b1b905dfd30786e92c3dc96aa36b..769c5480f8285ee4e63f23ebe296c0afea6eac2f 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -26,6 +26,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace neon namespace sve2 { @@ -44,6 +51,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sve2 namespace sme2 { @@ -62,6 +76,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sme2 } // namespace kleidicv diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index cb6fe639a86e42c87409ec6f955c71c7005835e2..17f3298e6e9de35a85094875fe4459b8c5c1ad01 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1031,20 +1031,23 @@ KLEIDICV_API_DECLARATION(kleidicv_canny_u8, const uint8_t *src, /// Creates a filter context according to the parameters. /// -/// Before a gaussian_blur operation, this initialization is needed. +/// Before a Gaussian blur operation, this initialization is needed. /// After the operation is finished, the context needs to be released /// using @ref kleidicv_filter_release. /// /// @param context Pointer where to return the created context's address. /// @param channels Number of channels in the data. Must be not more than /// @ref KLEIDICV_MAXIMUM_CHANNEL_COUNT. -/// @param type_size Size of buffer element in bytes. It must be double the -/// size of the type the filter operation is executed on. +/// @param intermediate_size Size of an intermediate buffer element in bytes. +/// The element must be large enough to fit values of +/// the intermediate type used internally by the +/// Gaussian blur operation. /// @param image Image dimensions. Its size must not be more than /// @ref KLEIDICV_MAX_IMAGE_PIXELS. /// kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, - size_t channels, size_t type_size, + size_t channels, + size_t intermediate_size, kleidicv_rectangle_t image); /// Releases a filter context that was previously created using @ref @@ -1071,6 +1074,16 @@ kleidicv_error_t kleidicv_filter_release(kleidicv_filter_context_t *context); /// [ 4, 16, 24, 16, 4 ] /// [ 1, 4, 6, 4, 1 ] /// ``` +/// 7x7 Gaussian Blur filter for uint8_t types: +/// ``` +/// [ 4, 14, 28, 36, 28, 14, 4 ] +/// [ 14, 49, 98, 126, 98, 49, 14 ] +/// [ 28, 98, 196, 252, 196, 98, 28 ] +/// 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] +/// [ 28, 98, 196, 252, 196, 98, 28 ] +/// [ 14, 49, 98, 126, 98, 49, 14 ] +/// [ 4, 14, 28, 36, 28, 14, 4 ] +/// ``` /// /// Width and height are the same for the source and for the destination. Number /// of elements is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. @@ -1119,6 +1132,14 @@ KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_5x5_u8, const uint8_t *src, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +/// @copydoc kleidicv_gaussian_blur_3x3_u8 +/// +KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_7x7_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + /// Splits a multi channel source stream into separate 1-channel streams. Width /// and height are the same for the source stream and for all the destination /// streams. Number of pixels is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index 7339eb8ee10b2d9007ef2eedc4048977c339d9a5..c5a59dc1a57d0c68d26553eaf5181c821dc1c213 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -325,266 +325,6 @@ void apply_block_operation_by_rows(OperationType &operation, zip_rows(block_operation, std::forward(args)...); } -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr Margin margin() { return Margin{1UL}; } - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c0())[index]; - auto src_1 = &src_rows.at(border_offsets.c1())[index]; - auto src_2 = &src_rows.at(border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - SourceVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.vertical_vector_path(src_a, &dst_rows[index]); - filter_.vertical_vector_path( - src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[3]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - BufferVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr Margin margin() { return Margin{2UL}; } - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[5]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_a[5], src_b[5]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - } // namespace kleidicv::neon #endif // KLEIDICV_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..ec03c40eaf8022670e5a727e06b5efff962a98db --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_3x3.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{1UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(border_offsets.c0())[index]; + auto src_1 = &src_rows.at(border_offsets.c1())[index]; + auto src_2 = &src_rows.at(border_offsets.c2())[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + SourceVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.vertical_vector_path(src_a, &dst_rows[index]); + filter_.vertical_vector_path( + src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + SourceVectorType src[3]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[3]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + BufferVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[3]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h new file mode 100644 index 0000000000000000000000000000000000000000..42c178b02cbbad68a5947fec57e637e76d3535a7 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -0,0 +1,163 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_SC_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_3x3.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{1UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, + &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..2694bc3bd7c46b84a6d202b581c57e093e122970 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -0,0 +1,141 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_5x5.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{2UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[5]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[5]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + + BufferVectorType src_a[5], src_b[5]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[5]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h new file mode 100644 index 0000000000000000000000000000000000000000..2115c1ed021f8751e3f063408ac30bd477a56738 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5x5_SC_H +#define KLEIDICV_SEPARABLE_FILTER_5x5_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_5x5.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{2UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5x5_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..0b71b237ac825a1cc3a7b8fa61a92fb516c4ad83 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_7x7.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{3UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[7]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[7]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + src[5] = src_rows.at(border_offsets.c5())[index]; + src[6] = src_rows.at(border_offsets.c6())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + + BufferVectorType src_a[7], src_b[7]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[7]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h new file mode 100644 index 0000000000000000000000000000000000000000..a19e01e3ad4336ad777eedeffd52a916f963c083 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7x7_SC_H +#define KLEIDICV_SEPARABLE_FILTER_7x7_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_7x7.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{3UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c5())[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c6())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7x7_SC_H diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sve2.h index bedd2cbd68b1223a643d2326d9a6b7fa5f7b912d..8f656d7d7f6f63cd71d2f860f80ed3f862e516b0 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -500,314 +500,6 @@ void apply_operation_by_rows(OperationType &operation, zip_rows(row_based_operation, std::forward(args)...); } -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{1UL}; - } - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, - &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{2UL}; - } - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - // Swap two variables, since some C++ Standard Library implementations do not // allow using std::swap for SVE vectors. template diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..ecd5627d328811f358fb05a469a830e966a35e76 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_3x3.h @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_3X3_H +#define KLEIDICV_WORKSPACE_BORDER_3X3_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 3x3 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + Offsets() = default; + + Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + + private: + size_t offsets_[3]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const { return get(-1, 0, 1); } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(0, 0, 1); + break; + + case FixedBorderType::WRAP: + return get(height_ - 1, 0, 1); + break; + + case FixedBorderType::REVERSE: + return get(1, 0, 1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(-1, 0, 0); + break; + + case FixedBorderType::WRAP: + return get(-1, 0, 1 - height_); + break; + + case FixedBorderType::REVERSE: + return get(-1, 0, -1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index == 0U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index == (height_ - 1U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2) const { + return Offsets{o0, o1, o2}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 3x3 filter border type. +template +using FixedBorderInfo3x3 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_3X3_H diff --git a/kleidicv/include/kleidicv/workspace/borders.h b/kleidicv/include/kleidicv/workspace/border_5x5.h similarity index 55% rename from kleidicv/include/kleidicv/workspace/borders.h rename to kleidicv/include/kleidicv/workspace/border_5x5.h index 5e5c18be236e2dc2afc5a506365f345bc1a4687b..06c2683bd9e5d1f895ca61cc4c887b1bd324b8ce 100644 --- a/kleidicv/include/kleidicv/workspace/borders.h +++ b/kleidicv/include/kleidicv/workspace/border_5x5.h @@ -2,135 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_WORKSPACE_BORDERS_H -#define KLEIDICV_WORKSPACE_BORDERS_H - -#include +#ifndef KLEIDICV_WORKSPACE_BORDER_5X5_H +#define KLEIDICV_WORKSPACE_BORDER_5X5_H +#include "border_types.h" #include "kleidicv/kleidicv.h" namespace KLEIDICV_TARGET_NAMESPACE { -enum class FixedBorderType { - REPLICATE, - REFLECT, - WRAP, - REVERSE, -}; - -inline std::optional get_fixed_border_type( - kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type) { - case KLEIDICV_BORDER_TYPE_REPLICATE: - return FixedBorderType::REPLICATE; - case KLEIDICV_BORDER_TYPE_REFLECT: - return FixedBorderType::REFLECT; - case KLEIDICV_BORDER_TYPE_WRAP: - return FixedBorderType::WRAP; - case KLEIDICV_BORDER_TYPE_REVERSE: - return FixedBorderType::REVERSE; - default: - return std::optional(); - } -} - // Border offsets for fixed-size filters. template class FixedBorderInfo; -// Border offsets for 3x3 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - Offsets() = default; - - Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - - private: - size_t offsets_[3]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const { return get(-1, 0, 1); } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(0, 0, 1); - break; - - case FixedBorderType::WRAP: - return get(height_ - 1, 0, 1); - break; - - case FixedBorderType::REVERSE: - return get(1, 0, 1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(-1, 0, 0); - break; - - case FixedBorderType::WRAP: - return get(-1, 0, 1 - height_); - break; - - case FixedBorderType::REVERSE: - return get(-1, 0, -1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index == 0U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index == (height_ - 1U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2) const { - return Offsets{o0, o1, o2}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - // Border offsets for 5x5 filters. template class FixedBorderInfo final { @@ -270,14 +153,10 @@ class FixedBorderInfo final { FixedBorderType border_type_; }; // end of class FixedBorderInfo -// Shorthand for 3x3 filter border type. -template -using FixedBorderInfo3x3 = FixedBorderInfo; - // Shorthand for 5x5 filter border type. template using FixedBorderInfo5x5 = FixedBorderInfo; } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_WORKSPACE_BORDERS_H +#endif // KLEIDICV_WORKSPACE_BORDER_5X5_H diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h new file mode 100644 index 0000000000000000000000000000000000000000..75bb86117e76e0b490c39eeebd050417ba43506f --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_7x7.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_7X7_H +#define KLEIDICV_WORKSPACE_BORDER_7X7_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 7x7 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) + : offsets_{o0, o1, o2, o3, o4, o5, o6} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + + private: + size_t offsets_[7]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-3, -2, -1, 0, 1, 2, 3); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(-1, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(2, 1, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(0, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); + } else { + return get(height_ - 3, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(3, 2, 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(1, 0, -1, 0, 1, 2, 3); + } else { + return get(-1, -2, -1, 0, 1, 2, 3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 1); + } else { + return get(-3, -2, -1, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 0); + } else { + return get(-3, -2, -1, 0, 0, -1, -2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 3 - height_); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); + } else { + return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 1); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 0, -1); + } else { + return get(-3, -2, -1, 0, -1, -2, -3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 2U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 3U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 7x7 filter border type. +template +using FixedBorderInfo7x7 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_7X7_H diff --git a/kleidicv/include/kleidicv/workspace/border_types.h b/kleidicv/include/kleidicv/workspace/border_types.h new file mode 100644 index 0000000000000000000000000000000000000000..0825bc372a5876c8691a065e346c4de85a8fe1e3 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_types.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_TYPES_H +#define KLEIDICV_WORKSPACE_BORDER_TYPES_H + +#include + +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +enum class FixedBorderType { + REPLICATE, + REFLECT, + WRAP, + REVERSE, +}; + +inline std::optional get_fixed_border_type( + kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type) { + case KLEIDICV_BORDER_TYPE_REPLICATE: + return FixedBorderType::REPLICATE; + case KLEIDICV_BORDER_TYPE_REFLECT: + return FixedBorderType::REFLECT; + case KLEIDICV_BORDER_TYPE_WRAP: + return FixedBorderType::WRAP; + case KLEIDICV_BORDER_TYPE_REVERSE: + return FixedBorderType::REVERSE; + default: + return std::optional(); + } +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_TYPES_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 3cf1671b996866425ee71a3bafb0ced1c9967958..b8d036a0791c78c455665805374924480eac9437 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -8,7 +8,7 @@ #include #include -#include "borders.h" +#include "border_types.h" #include "kleidicv/kleidicv.h" #include "kleidicv/types.h" @@ -81,8 +81,9 @@ class SeparableFilterWorkspace final { // Creates a workspace on the heap. static Pointer create(Rectangle rect, size_t channels, - size_t buffer_type_size) KLEIDICV_STREAMING_COMPATIBLE { - size_t buffer_rows_width = buffer_type_size * rect.width(); + size_t intermediate_size) + KLEIDICV_STREAMING_COMPATIBLE { + size_t buffer_rows_width = intermediate_size * rect.width(); // Adding more elements because of SVE, where interleaving stores are // governed by one predicate. For example, if a predicate requires 7 uint8_t // elements and an algorithm performs widening to 16 bits, the resulting @@ -110,14 +111,14 @@ class SeparableFilterWorkspace final { workspace->buffer_rows_stride_ = buffer_rows_stride; workspace->image_size_ = rect; workspace->channels_ = channels; - workspace->buffer_type_size_ = buffer_type_size; + workspace->intermediate_size_ = intermediate_size; return workspace; } size_t channels() const { return channels_; } Rectangle image_size() const { return image_size_; } - size_t buffer_type_size() const { return buffer_type_size_; } + size_t intermediate_size() const { return intermediate_size_; } // Processes rows vertically first along the full width template @@ -204,7 +205,7 @@ class SeparableFilterWorkspace final { Rectangle image_size_; size_t channels_; - size_t buffer_type_size_; + size_t intermediate_size_; // Workspace area begins here. uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index eeb6d7f747f8456a6cbcf57abd3c1b778e1e1f82..f42da7913c6ee2e9c2413f64aaf5683fd9c3b600 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -13,12 +13,13 @@ using KLEIDICV_TARGET_NAMESPACE::Rectangle; using KLEIDICV_TARGET_NAMESPACE::SeparableFilterWorkspace; kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, - size_t channels, size_t type_size, + size_t channels, + size_t intermediate_size, kleidicv_rectangle_t image) { CHECK_POINTERS(context); CHECK_RECTANGLE_SIZE(image); - if (type_size > KLEIDICV_MAXIMUM_TYPE_SIZE) { + if (intermediate_size > KLEIDICV_MAXIMUM_TYPE_SIZE) { return KLEIDICV_ERROR_RANGE; } @@ -26,8 +27,8 @@ kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, return KLEIDICV_ERROR_RANGE; } - auto workspace = - SeparableFilterWorkspace::create(Rectangle{image}, channels, type_size); + auto workspace = SeparableFilterWorkspace::create(Rectangle{image}, channels, + intermediate_size); if (!workspace) { *context = nullptr; return KLEIDICV_ERROR_ALLOCATION; @@ -60,3 +61,8 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_gaussian_blur_5x5_u8, &kleidicv::neon::gaussian_blur_5x5_u8, KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_5x5_u8), &kleidicv::sme2::gaussian_blur_5x5_u8); + +KLEIDICV_MULTIVERSION_C_API( + kleidicv_gaussian_blur_7x7_u8, &kleidicv::neon::gaussian_blur_7x7_u8, + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_7x7_u8), + &kleidicv::sme2::gaussian_blur_7x7_u8); diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index 096231d350b124ff983a527781bc57529b115e2e..33ce02f7fe0a16e65691069e980b34f1b1bdbba4 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -5,6 +5,9 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" +#include "kleidicv/separable_filter_5x5_neon.h" +#include "kleidicv/separable_filter_7x7_neon.h" namespace kleidicv::neon { @@ -79,7 +82,6 @@ class DiscreteGaussianBlur { // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] // [ 4, 16, 24, 16, 4 ] [ 4 ] // [ 1, 4, 6, 4, 1 ] [ 1 ] -// 5x5 Gaussian Blur filter for uint8_t types. template <> class DiscreteGaussianBlur { public: @@ -88,9 +90,9 @@ class DiscreteGaussianBlur { using DestinationType = uint8_t; DiscreteGaussianBlur() - : const_6_u8_{vmov_n_u8(6)}, - const_6_u16_{vmovq_n_u16(6)}, - const_4_u16_{vmovq_n_u16(4)} {} + : const_6_u8_half_{vdup_n_u8(6)}, + const_6_u16_{vdupq_n_u16(6)}, + const_4_u16_{vdupq_n_u16(4)} {} // Applies vertical filtering vector using SIMD operations. // @@ -100,8 +102,10 @@ class DiscreteGaussianBlur { uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4])); uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3])); uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3])); - uint16x8_t acc_l = vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_); - uint16x8_t acc_h = vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_); + uint16x8_t acc_l = + vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_); + uint16x8_t acc_h = + vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_); acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_); acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_); vst1q(&dst[0], acc_l); @@ -137,11 +141,149 @@ class DiscreteGaussianBlur { } private: - uint8x8_t const_6_u8_; + uint8x8_t const_6_u8_half_; uint16x8_t const_6_u16_; uint16x8_t const_4_u16_; }; // end of class DiscreteGaussianBlur +// Template for 7x7 Gaussian Blur approximation filters. +// +// [ 4, 14, 28, 36, 28, 14, 4 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 28, 98, 196, 252, 196, 98, 28 ] +// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] = +// [ 28, 98, 196, 252, 196, 98, 28 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 4, 14, 28, 36, 28, 14, 4 ] +// +// [ 2 ] +// [ 7 ] +// [ 14 ] +// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ] +// [ 14 ] +// [ 7 ] +// [ 2 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint16_t; + using DestinationType = uint8_t; + + DiscreteGaussianBlur() + : const_7_u16_{vdupq_n_u16(7)}, + const_7_u32_{vdupq_n_u32(7)}, + const_9_u16_{vdupq_n_u16(9)} {} + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const { + uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6])); + uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6])); + + uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5])); + uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5])); + + uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4])); + uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4])); + + uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3])); + uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3])); + + uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_); + uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_); + + uint16x8_t acc_0_2_3_4_6_l = + vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_); + uint16x8_t acc_0_2_3_4_6_h = + vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_); + + acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1); + acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1); + + uint16x8_t acc_0_1_2_3_4_5_6_l = + vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_); + uint16x8_t acc_0_1_2_3_4_5_6_h = + vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_); + + vst1q(&dst[0], acc_0_1_2_3_4_5_6_l); + vst1q(&dst[8], acc_0_1_2_3_4_5_6_h); + } + + // Applies vertical filtering vector using scalar operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_scalar_path(const SourceType src[7], BufferType *dst) const { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = acc; + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const { + uint32x4_t acc_0_6_l = + vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6])); + uint32x4_t acc_0_6_h = + vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6])); + + uint32x4_t acc_1_5_l = + vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5])); + uint32x4_t acc_1_5_h = + vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5])); + + uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]); + + uint32x4_t acc_0_2_4_6_l = + vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_)); + uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4), + vget_high_u16(const_7_u16_)); + + uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]), + vget_low_u16(const_9_u16_)); + uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]), + vget_high_u16(const_9_u16_)); + + acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1); + acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1); + + uint32x4_t acc_0_1_2_3_4_5_6_l = + vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_); + uint32x4_t acc_0_1_2_3_4_5_6_h = + vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_); + + uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12); + uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12); + + uint16x8_t acc_0_1_2_3_4_5_6_u16 = + vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h); + uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16); + + vst1(&dst[0], acc_0_1_2_3_4_5_6_u8); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_scalar_path(const BufferType src[7], + DestinationType *dst) const { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = rounding_shift_right(acc, 12); + } + + private: + uint16x8_t const_7_u16_; + uint32x4_t const_7_u32_; + uint16x8_t const_9_u16_; +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -170,7 +312,7 @@ kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, auto *workspace = reinterpret_cast(context); - if (workspace->buffer_type_size() != 2 * sizeof(ScalarType)) { + if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } @@ -219,4 +361,16 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); +} + } // namespace kleidicv::neon diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 491bd454ea2780fc012a35d1d6b849770cfb9ce3..5f5127652fde0b72ef0521cdc574200aa322b87f 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,6 +8,9 @@ #include #include "kleidicv/kleidicv.h" +#include "kleidicv/separable_filter_3x3_sc.h" +#include "kleidicv/separable_filter_5x5_sc.h" +#include "kleidicv/separable_filter_7x7_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -137,6 +140,129 @@ class DiscreteGaussianBlur { } }; // end of class DiscreteGaussianBlur +// Template for 7x7 Gaussian Blur approximation filters. +// +// [ 4, 14, 28, 36, 28, 14, 4 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 28, 98, 196, 252, 196, 98, 28 ] +// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] = +// [ 28, 98, 196, 252, 196, 98, 28 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 4, 14, 28, 36, 28, 14, 4 ] +// +// [ 2 ] +// [ 7 ] +// [ 14 ] +// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ] +// [ 14 ] +// [ 7 ] +// [ 2 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint16_t; + using DestinationType = uint8_t; + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_vector_path( + svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, + svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t const_7_u16 = svdup_n_u16(7); + svuint16_t const_9_u16 = svdup_n_u16(9); + + svuint16_t acc_0_6_b = svaddlb_u16(src_0, src_6); + svuint16_t acc_0_6_t = svaddlt_u16(src_0, src_6); + + svuint16_t acc_1_5_b = svaddlb_u16(src_1, src_5); + svuint16_t acc_1_5_t = svaddlt_u16(src_1, src_5); + + svuint16_t acc_2_4_b = svaddlb_u16(src_2, src_4); + svuint16_t acc_2_4_t = svaddlt_u16(src_2, src_4); + + svuint16_t acc_3_b = svmovlb_u16(src_3); + svuint16_t acc_3_t = svmovlt_u16(src_3); + + svuint16_t acc_0_2_4_6_b = + svmla_u16_x(pg, acc_0_6_b, acc_2_4_b, const_7_u16); + svuint16_t acc_0_2_4_6_t = + svmla_u16_x(pg, acc_0_6_t, acc_2_4_t, const_7_u16); + + svuint16_t acc_0_2_3_4_6_b = + svmla_u16_x(pg, acc_0_2_4_6_b, acc_3_b, const_9_u16); + svuint16_t acc_0_2_3_4_6_t = + svmla_u16_x(pg, acc_0_2_4_6_t, acc_3_t, const_9_u16); + acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1); + acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1); + + svuint16_t acc_0_1_2_3_4_5_6_b = + svmla_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, const_7_u16); + svuint16_t acc_0_1_2_3_4_5_6_t = + svmla_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, const_7_u16); + + svuint16x2_t interleaved = + svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t); + svst2(pg, &dst[0], interleaved); + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_vector_path( + svbool_t pg, svuint16_t src_0, svuint16_t src_1, svuint16_t src_2, + svuint16_t src_3, svuint16_t src_4, svuint16_t src_5, svuint16_t src_6, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t const_7_u16 = svdup_n_u16(7); + svuint16_t const_9_u16 = svdup_n_u16(9); + svuint32_t const_7_u32 = svdup_n_u32(7); + + svuint32_t acc_0_6_b = svaddlb_u32(src_0, src_6); + svuint32_t acc_0_6_t = svaddlt_u32(src_0, src_6); + + svuint32_t acc_1_5_b = svaddlb_u32(src_1, src_5); + svuint32_t acc_1_5_t = svaddlt_u32(src_1, src_5); + + svuint16_t acc_2_4 = svadd_u16_x(pg, src_2, src_4); + + svuint32_t acc_0_2_4_6_b = svmlalb_u32(acc_0_6_b, acc_2_4, const_7_u16); + svuint32_t acc_0_2_4_6_t = svmlalt_u32(acc_0_6_t, acc_2_4, const_7_u16); + + svuint32_t acc_0_2_3_4_6_b = svmlalb_u32(acc_0_2_4_6_b, src_3, const_9_u16); + svuint32_t acc_0_2_3_4_6_t = svmlalt_u32(acc_0_2_4_6_t, src_3, const_9_u16); + + acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1); + acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1); + + svuint32_t acc_0_1_2_3_4_5_6_b = + svmla_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, const_7_u32); + svuint32_t acc_0_1_2_3_4_5_6_t = + svmla_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, const_7_u32); + + svuint16_t acc_0_1_2_3_4_5_6_u16_b = + svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12); + svuint16_t acc_0_1_2_3_4_5_6_u16 = + svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12); + + svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) + const KLEIDICV_STREAMING_COMPATIBLE { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = rounding_shift_right(acc, 12); + } +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur( const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -164,7 +290,7 @@ kleidicv_error_t discrete_gaussian_blur( auto *workspace = reinterpret_cast(context); - if (workspace->buffer_type_size() != 2 * sizeof(ScalarType)) { + if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp index 9f692b3c0e98f35f86b762d501165ab431a11875..33a2dd09da101e40b79c34443df3f3dba55cd11a 100644 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sme2.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -27,4 +27,14 @@ gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, border_type, context); } +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t channels, kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); +} + } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_sve2.cpp index 12c7176de60653a03159bb2a726055165448d575..7ae808d33e7364f84996fae951a4f7c41eee1e9a 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sve2.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -31,4 +31,16 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); +} + } // namespace kleidicv::sve2 diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 51a419392d5d5c41a023c6cd74ff801bd7a1407c..869e3478812e2f031ddfff2a7c9f2e05be3de526 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -6,6 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 5671232fbdaace3e9830ed0e95616287027e2a36..6da9236942a90db0694db3273bcb164fab045fdc 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -7,6 +7,7 @@ #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" +#include "kleidicv/separable_filter_3x3_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 12ab7c7ab653d0fed1a3bf62ce9bd0483c761398..da9040dbb207735cebabc3b9476933424c32e04a 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -17,6 +17,7 @@ KLEIDICV_GAUSSIAN_BLUR(uint8_t, 3x3, u8); KLEIDICV_GAUSSIAN_BLUR(uint8_t, 5x5, u8); +KLEIDICV_GAUSSIAN_BLUR(uint8_t, 7x7, u8); // Implements KernelTestParams for Gaussian Blur operators. template @@ -25,7 +26,7 @@ struct GaussianBlurKernelTestParams; template struct GaussianBlurKernelTestParams { using InputType = uint8_t; - using IntermediateType = uint16_t; + using IntermediateType = uint32_t; using OutputType = uint8_t; static constexpr size_t kKernelSize = KernelSize; @@ -102,13 +103,16 @@ class GaussianBlurTest : public test::KernelTest { test::Array2D *output, kleidicv_border_type_t border_type, kleidicv_border_values_t) override { - auto api = KernelTestParams::kKernelSize == 3 - ? gaussian_blur_3x3() - : gaussian_blur_5x5(); + // NOLINTBEGIN(readability-avoid-nested-conditional-operator) + auto api = + KernelTestParams::kKernelSize == 3 ? gaussian_blur_3x3() + : KernelTestParams::kKernelSize == 5 ? gaussian_blur_5x5() + : gaussian_blur_7x7(); + // NOLINTEND(readability-avoid-nested-conditional-operator) kleidicv_filter_context_t *context = nullptr; auto ret = kleidicv_filter_create( - &context, input->channels(), sizeof(IntermediateType), + &context, input->channels(), 2 * sizeof(InputType), kleidicv_rectangle_t{input->width() / input->channels(), input->height()}); if (ret != KLEIDICV_OK) { @@ -129,9 +133,13 @@ class GaussianBlurTest : public test::KernelTest { // Apply rounding to nearest integer division. IntermediateType scale_result(const test::Kernel &kernel, IntermediateType result) override { - return kernel.width() == 3 ? ((result + 8) / 16) : ((result + 128) / 256); + // NOLINTBEGIN(readability-avoid-nested-conditional-operator) + return kernel.width() == 3 ? ((result + 8) / 16) + : kernel.width() == 5 ? ((result + 128) / 256) + : ((result + 2048) / 4096); + // NOLINTEND(readability-avoid-nested-conditional-operator) } -}; // end of class class GaussianBlur3x3Test +}; // end of class GaussianBlurTest using ElementTypes = ::testing::Types; @@ -190,6 +198,25 @@ TYPED_TEST(GaussianBlur, 5x5) { .test(mask); } +// Tests gaussian_blur_7x7_ API. +TYPED_TEST(GaussianBlur, 7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + // 7x7 GaussianBlur operator. + test::Array2D mask{7, 7}; + // clang-format off + mask.set(0, 0, { 4, 14, 28, 36, 28, 14, 4 }); + mask.set(1, 0, { 14, 49, 98, 126, 98, 49, 14 }); + mask.set(2, 0, { 28, 98, 196, 252, 196, 98, 28 }); + mask.set(3, 0, { 36, 126, 252, 324, 252, 126, 36 }); + mask.set(4, 0, { 28, 98, 196, 252, 196, 98, 28 }); + mask.set(5, 0, { 14, 49, 98, 126, 98, 49, 14 }); + mask.set(6, 0, { 4, 14, 28, 36, 28, 14, 4 }); + // clang-format on + GaussianBlurTest{} + .with_border_types(make_generator_ptr(kAllBorders)) + .test(mask); +} + TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -232,9 +259,31 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, UnsupportedBorderType7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + TypeParam src[1] = {}, dst[1]; + for (kleidicv_border_type_t border : { + KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_TRANSPARENT, + KLEIDICV_BORDER_TYPE_NONE, + }) { + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()(src, sizeof(TypeParam), dst, + sizeof(TypeParam), validSize, + validSize, 1, border, context)); + } + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, NullPointer) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -248,6 +297,10 @@ TYPED_TEST(GaussianBlur, NullPointer) { test::test_null_args(gaussian_blur_5x5(), src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context); + validSize = KernelTestParams7x7::kKernelSize - 1; + test::test_null_args(gaussian_blur_7x7(), src, sizeof(TypeParam), + dst, sizeof(TypeParam), validSize, validSize, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, context); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -258,6 +311,7 @@ TYPED_TEST(GaussianBlur, Misalignment) { } using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -281,6 +335,15 @@ TYPED_TEST(GaussianBlur, Misalignment) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_7x7()( + src, sizeof(TypeParam) + 1, dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -328,6 +391,28 @@ TYPED_TEST(GaussianBlur, ZeroImageSize5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ZeroImageSize7x7) { + TypeParam src[1] = {}, dst[1]; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{0, 1})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 0, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{1, 0})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 1, 0, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, ValidImageSize3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -372,16 +457,41 @@ TYPED_TEST(GaussianBlur, ValidImageSize5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ValidImageSize7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + test::Array2D src{validSize, validSize, + test::Options::vector_length()}; + src.set(0, 0, {1, 2, 3, 4, 5, 6}); + src.set(1, 0, {1, 2, 3, 4, 5, 6}); + src.set(2, 0, {1, 2, 3, 4, 5, 6}); + src.set(3, 0, {1, 2, 3, 4, 5, 6}); + src.set(4, 0, {1, 2, 3, 4, 5, 6}); + src.set(5, 0, {1, 2, 3, 4, 5, 6}); + + test::Array2D dst{validSize, validSize, + test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, + gaussian_blur_7x7()( + src.data(), src.stride(), dst.data(), dst.stride(), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REVERSE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, UndersizeImage3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t underSize = KernelTestParams::kKernelSize - 2; size_t validWidth = KernelTestParams::kKernelSize + 10; size_t validHeight = KernelTestParams::kKernelSize + 5; - TypeParam src[1], dst[1]; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), kleidicv_rectangle_t{underSize, underSize})); + TypeParam src[1] = {}, dst[1]; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, gaussian_blur_3x3()( src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, @@ -409,9 +519,8 @@ TYPED_TEST(GaussianBlur, UndersizeImage5x5) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t underSize = KernelTestParams::kKernelSize - 2; - size_t width = KernelTestParams::kKernelSize + 8; - size_t height = KernelTestParams::kKernelSize + 3; - + size_t validWidth = KernelTestParams::kKernelSize + 8; + size_t validHeight = KernelTestParams::kKernelSize + 3; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), kleidicv_rectangle_t{underSize, underSize})); @@ -421,20 +530,53 @@ TYPED_TEST(GaussianBlur, UndersizeImage5x5) { src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), - kleidicv_rectangle_t{underSize, height})); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, validHeight})); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, - height, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + validHeight, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validWidth, underSize})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_5x5()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + +TYPED_TEST(GaussianBlur, UndersizeImage7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t underSize = KernelTestParams::kKernelSize - 2; + size_t validWidth = KernelTestParams::kKernelSize + 6; + size_t validHeight = KernelTestParams::kKernelSize + 1; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), - kleidicv_rectangle_t{width, underSize})); + kleidicv_rectangle_t{underSize, underSize})); + TypeParam src[1] = {}, dst[1]; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur_5x5()( - src, sizeof(TypeParam), dst, sizeof(TypeParam), width, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, validHeight})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + validHeight, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validWidth, underSize})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -465,12 +607,23 @@ TYPED_TEST(GaussianBlur, OversizeImage) { src, sizeof(TypeParam), dst, sizeof(TypeParam), KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, ChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -490,12 +643,20 @@ TYPED_TEST(GaussianBlur, ChannelNumber) { src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextSizeType) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -512,12 +673,18 @@ TYPED_TEST(GaussianBlur, InvalidContextSizeType) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -535,12 +702,19 @@ TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextImageSize) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -558,6 +732,12 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, + validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); }