diff --git a/CHANGELOG.md b/CHANGELOG.md index 2584beee116f11cedd3852eec3d16a6ebea68e4f..67dd6e2249d6b926129c58a265661b6d4608c5b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 3x3 kernels. - Median Blur for generic kernels (odd-sized only, max kernel size 255x255), Neon backend only. +- Gaussian Blur for any odd kernel size (up to 255x255) with replicated borders ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 113beb6c12bccb4c1bd52f4b03a5efabc848cd64..953e924dd87e479df5e13c6c725dbe5b16a31f62 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -598,9 +598,11 @@ int gaussian_blur_binomial(const uchar *src_data, size_t src_step, } // Check for not-implemented before allocating a context - if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_size, - kernel_size, 0, 0) || - !kleidicv::get_fixed_border_type(kleidicv_border_type)) { + auto fixed_border_type = + kleidicv::get_fixed_border_type(kleidicv_border_type); + if (!fixed_border_type || !kleidicv::gaussian_blur_is_implemented( + width, height, kernel_size, kernel_size, 0, 0, + cn, *fixed_border_type)) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -651,9 +653,11 @@ int gaussian_blur(const uchar *src_data, size_t src_step, uchar *dst_data, } // Check for not-implemented before allocating a context - if (!kleidicv::gaussian_blur_is_implemented( - width, height, kernel_width, kernel_height, sigma_x, sigma_y) || - !kleidicv::get_fixed_border_type(kleidicv_border_type)) { + auto fixed_border_type = + kleidicv::get_fixed_border_type(kleidicv_border_type); + if (!fixed_border_type || !kleidicv::gaussian_blur_is_implemented( + width, height, kernel_width, kernel_height, + sigma_x, sigma_y, cn, *fixed_border_type)) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index ee72a405c24a5cb37adbe135cbe67ed7c7e9e3a8..226f7e4fa2ff6185c974c17fd3bbfffb705ffbd5 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include "tests.h" @@ -27,12 +28,8 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { cv::RNG rng(0); - size_t size_min = 5; - size_t size_max = 16; - if constexpr (KernelSize >= 15) { - size_min = KernelSize - 1; - size_max = 2 * KernelSize + 2; - } + size_t size_min = std::max(4, KernelSize - 1); + size_t size_max = std::max(16, 2 * KernelSize + 2); for (size_t y = size_min; y <= size_max; ++y) { for (size_t x = size_min; x <= size_max; ++x) { @@ -62,7 +59,7 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, // For bigger kernels, and for all the CustomSigma variants, a small // difference is allowed. if constexpr (KernelSize > 7 || !Binomial) { - threshold = 2; + threshold = 1; } if (are_matrices_different(threshold, actual, expected)) { @@ -203,6 +200,12 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 21x21, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 21x21, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 21x21, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + + // Generic kernel size + TEST("Gaussian blur 9x9, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 9x9, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 9x9, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 9x9, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 984e784e0d005b40455e4caff0cdaa3101406985..3c8b9a86c6172748dc1ec9fc8a77d68b3ef96164 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -72,15 +72,16 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Rotate (90 degrees clockwise) | x | x | x | x | ## Image filters -| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | -|---------------------------------------------|-----|-----|-----|-----|-----|-----|-----| -| Erode | | x | | | | | | -| Dilate | | x | | | | | | -| Sobel (3x3) | | x | | | | | | -| Separable Filter 2D (5x5) | | x | x | x | | | | -| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | -| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | -| Median Blur (generic imp, max size 255x255) | | x | | | | | | +| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | +|---------------------------------------------------|-----|-----|-----|-----|-----|-----|-----| +| Erode | | x | | | | | | +| Dilate | | x | | | | | | +| Sobel (3x3) | | x | | | | | | +| Separable Filter 2D (5x5) | | x | x | x | | | | +| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | +| Gaussian Blur any kernel size, Replicated Borders | | x | | | | | | +| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | +| Median Blur (generic imp, max size 255x255) | | x | | | | | | ## Resize to quarter diff --git a/doc/opencv.md b/doc/opencv.md index 9672b1e58f043ab281860b857d6d700b88657c79..a581a0cb9a7e26bfe5a5eca39fe94dc82b7b1066 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -108,8 +108,8 @@ In-place filtering is not supported i.e. `src` and `dst` must be different (non- Notes on parameters: * `src.depth()` - only supports `CV_8U` depth. * `src.cols`,`src.rows` - image width and height must be greater than or equal to `ksize - 1` -* `ksize` - supported kernel sizes are 3x3, 5x5, 7x7, 15x15 and 21x21. -* `sigmaX`, `sigmaY` - optimal performance is achieved if these are set to 0. +* `ksize` - supported kernel sizes are 3x3, 5x5, 7x7, 15x15 and 21x21. Odd kernel sizes between 9 and 255 only supported with `cv::BORDER_REPLICATE`. +* `sigmaX`, `sigmaY` - If set to 0, it is automatically calculated. Up to 7x7 kernel size optimal performance is achieved this way. * `borderType` - supported [OpenCV border types](https://docs.opencv.org/4.11.0/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5) are: + `cv::BORDER_REPLICATE` + `cv::BORDER_REFLECT` diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 68e5e1d3c24a91e718bc3bca18c73ea1668d6205..c3c07104090ccff5824efa668ed3bb69b3b34958 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -8,31 +8,51 @@ #include "kleidicv/config.h" #include "kleidicv/kleidicv.h" #include "kleidicv/types.h" +#include "kleidicv/utils.h" #include "kleidicv/workspace/border_types.h" +#include "kleidicv/workspace/separable.h" extern "C" { // For internal use only. See instead kleidicv_gaussian_blur_u8. // Blur a horizontal stripe across an image. The stripe is defined by the // range (y_begin, y_end]. -KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_stripe_u8, const uint8_t *src, - size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height, size_t y_begin, - size_t y_end, size_t channels, size_t kernel_width, - size_t kernel_height, float sigma_x, float sigma_y, +KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_fixed_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, + kleidicv::FixedBorderType border_type, + kleidicv_filter_context_t *context); + +KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_arbitrary_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, kleidicv::FixedBorderType border_type, kleidicv_filter_context_t *context); } namespace kleidicv { -inline bool gaussian_blur_is_implemented(size_t width, size_t height, - size_t kernel_width, - size_t kernel_height, float sigma_x, - float sigma_y) { +inline bool gaussian_blur_is_implemented( + size_t width, size_t height, size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, size_t channels, + kleidicv::FixedBorderType border_type) { if (kernel_width != kernel_height) { return false; } + if (kernel_width < 3 || kernel_width > 255) { + return false; + } + + if ((kernel_width & 1) != 1) { + return false; + } + if (sigma_x != sigma_y) { return false; } @@ -41,23 +61,64 @@ inline bool gaussian_blur_is_implemented(size_t width, size_t height, return false; } - switch (kernel_width) { - case 3: - case 5: - case 7: - case 15: - case 21: - break; - default: + if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) { + return false; + } + + if (kernel_width > 7 && kernel_width != 15 && kernel_width != 21) { + if (border_type != FixedBorderType::REPLICATE) { return false; + } + + size_t margin = kernel_width / 2; + // Number of 16bit elements in a 128-bit vector + size_t max_border_length = 8; + size_t aligned_margin = (margin + max_border_length - 1) / + max_border_length * max_border_length; + if (width < aligned_margin + margin) { + return false; + } } return true; } +// Does not include checks for whether the operation is implemented. +// This must be done earlier, by gaussian_blur_is_implemented. +template +kleidicv_error_t gaussian_blur_checks( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, + const KLEIDICV_TARGET_NAMESPACE::SeparableFilterWorkspace *workspace) + KLEIDICV_STREAMING_COMPATIBLE { + CHECK_POINTERS(workspace); + + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + if (workspace->channels() < channels) { + return KLEIDICV_ERROR_CONTEXT_MISMATCH; + } + + const KLEIDICV_TARGET_NAMESPACE::Rectangle &context_rect = + workspace->image_size(); + if (context_rect.width() < width || context_rect.height() < height) { + return KLEIDICV_ERROR_CONTEXT_MISMATCH; + } + + return KLEIDICV_OK; +} + namespace neon { -kleidicv_error_t gaussian_blur_stripe_u8( +kleidicv_error_t gaussian_blur_fixed_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType border_type, kleidicv_filter_context_t *context); + +kleidicv_error_t gaussian_blur_arbitrary_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, @@ -67,7 +128,7 @@ kleidicv_error_t gaussian_blur_stripe_u8( namespace sve2 { -kleidicv_error_t gaussian_blur_stripe_u8( +kleidicv_error_t gaussian_blur_fixed_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, @@ -77,7 +138,7 @@ kleidicv_error_t gaussian_blur_stripe_u8( namespace sme2 { -kleidicv_error_t gaussian_blur_stripe_u8( +kleidicv_error_t gaussian_blur_fixed_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, diff --git a/kleidicv/include/kleidicv/filters/sigma.h b/kleidicv/include/kleidicv/filters/sigma.h index 596ff0e28198c740ae4330be19b461168cf79a97..87ae9d2e83cc2d979f03f606952d757f40c9d2cb 100644 --- a/kleidicv/include/kleidicv/filters/sigma.h +++ b/kleidicv/include/kleidicv/filters/sigma.h @@ -5,8 +5,9 @@ #ifndef KLEIDICV_SIGMA_H #define KLEIDICV_SIGMA_H -#include #include +#include +#include #include "kleidicv/config.h" @@ -21,22 +22,21 @@ static constexpr size_t get_half_kernel_size(size_t kernel_size) // This function is not marked as streaming compatible, as std::round is also // not streaming compatible. -template -static std::array generate_gaussian_half_kernel( - float sigma) { +static void generate_gaussian_half_kernel(uint16_t* half_kernel, + size_t half_size, float sigma) { // Define the mid point of the full kernel range. - constexpr size_t kMid = HalfKernelSize - 1; + const size_t kMid = half_size - 1; // Define the full kernel size. - constexpr size_t KernelSize = kMid * 2 + 1; + const size_t kKernelSize = kMid * 2 + 1; // Calculate the sigma manually in case it is not defined. if (sigma == 0.0) { - sigma = static_cast(KernelSize) * 0.15 + 0.35; + sigma = static_cast(kKernelSize) * 0.15F + 0.35F; } // Temporary float half-kernel. - std::array half_kernel_float{}; + float half_kernel_float[255]; // Prepare the sigma value for later multiplication inside a loop. float coefficient = 1 / -(2 * sigma * sigma); @@ -59,9 +59,6 @@ static std::array generate_gaussian_half_kernel( // by 256. float multiplier = 256 / (sum * 2 + 1); - // Result half-kernel - std::array half_kernel{}; - // Normalize the kernel and convert it to the fixed-point format. Rounding // errors are diffused in the kernel. float error = 0.0; @@ -72,8 +69,6 @@ static std::array generate_gaussian_half_kernel( error = value_rounded - value; } half_kernel[kMid] = static_cast(std::round(multiplier - error)); - - return half_kernel; } } // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 257470a2ffb0e78bc3a9e9a258b7adaf407eaddd..5e4b1edaa33dad8166202fe423e30116e835cf34 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1335,7 +1335,9 @@ kleidicv_error_t kleidicv_separable_filter_2d_s16( /// sizeof(type) * channels`, except for single-row images. /// @param width Number of columns in the data. (One column consists of /// `channels` number of elements.) Must be greater than -/// or equal to `kernel_width - 1`. +/// or equal to `kernel_width - 1` (if kernel_width is +/// 3,5,7,15,21), or `(kernel_width/2) rounded up to 8, +/// plus kernel_width/2` (for other kernel sizes). /// @param height Number of rows in the data. Must be greater than /// or equal to `kernel_height - 1`. /// @param channels Number of channels in the data. Must be not more than diff --git a/kleidicv/include/kleidicv/workspace/border_types.h b/kleidicv/include/kleidicv/workspace/border_types.h index 14c729735bc543b5888bf886f663f7de4723d05d..4803493fb79e60a8dd812064ddb419d98aef7e14 100644 --- a/kleidicv/include/kleidicv/workspace/border_types.h +++ b/kleidicv/include/kleidicv/workspace/border_types.h @@ -7,7 +7,7 @@ #include -#include "kleidicv/kleidicv.h" +#include "kleidicv/ctypes.h" namespace kleidicv { diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 8e303473101b18e5034f09f3fb9344d611da3155..f0564cd833d33f676aadc1c3432f8e2fcd366c9a 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -5,11 +5,11 @@ #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H #define KLEIDICV_WORKSPACE_SEPARABLE_H +#include +#include #include #include -#include "border_types.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/types.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -155,6 +155,49 @@ class SeparableFilterWorkspace { } } + // Processes rows vertically first along the full width + template + void process_arbitrary(Rectangle rect, size_t kernel_size, size_t y_begin, + size_t y_end, + Rows src_rows, + Rows dst_rows, + size_t channels, + typename FilterType::BorderType /* border_type */, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + // Buffer rows which hold intermediate widened data. + auto buffer_rows = Rows{reinterpret_cast( + &data_[buffer_rows_offset_]), + buffer_rows_stride_, channels}; + size_t margin = kernel_size / 2; + + // Process top rows, affected by border + for (size_t row_index = y_begin; row_index < std::max(y_begin, margin); + ++row_index) { + filter.process_arbitrary_border_vertical(rect.width(), src_rows, + row_index, buffer_rows); + filter.process_arbitrary_horizontal(rect.width(), kernel_size, + buffer_rows, dst_rows.at(row_index)); + } + + // Process middle rows that are not affected by any borders + for (size_t row_index = std::max(y_begin, margin); + row_index < std::min(y_end, rect.height() - margin); ++row_index) { + filter.process_arbitrary_vertical(rect.width(), src_rows.at(row_index), + buffer_rows); + filter.process_arbitrary_horizontal(rect.width(), kernel_size, + buffer_rows, dst_rows.at(row_index)); + } + + // Process bottom rows, affected by border + for (size_t row_index = std::min(y_end, rect.height() - margin); + row_index < y_end; ++row_index) { + filter.process_arbitrary_border_vertical(rect.width(), src_rows, + row_index, buffer_rows); + filter.process_arbitrary_horizontal(rect.width(), kernel_size, + buffer_rows, dst_rows.at(row_index)); + } + } + protected: template void process_horizontal(size_t width, diff --git a/kleidicv/src/filters/border_generic_neon.h b/kleidicv/src/filters/border_generic_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..5837bbcfb078b5210909d61f37713eef299b4425 --- /dev/null +++ b/kleidicv/src/filters/border_generic_neon.h @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_GENERIC_NEON_H +#define KLEIDICV_WORKSPACE_BORDER_GENERIC_NEON_H + +#include +#include + +#include "kleidicv/neon.h" +#include "kleidicv/types.h" +#include "kleidicv/workspace/border_types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for generic filters. +template +class GenericBorderHorizontal final { + public: + GenericBorderHorizontal(size_t width, size_t channels) + : width_(static_cast(width)), + channels_{static_cast(channels)}, + data_indices_{0UL | (1UL << 8) | (2UL << 16) | (3UL << 24) | + (4UL << 32) | (5UL << 40) | (6UL << 48) | (7UL << 56)}, + border_indices_left_{0}, + border_indices_right_{0} { + // The result will take some elements from the image (data), and the + // remaining parts from the border. + // An index vector is prepared here to help the process, e.g. for replicated + // borders and 3 channels, the constructed index vector will look like this: + // [1, 2, 0, 1, 2, 3, 4, 5] + // (0,1,2 is repeated until index 0 is reached, when the image data begins) + // Right side is similar, but it is the [5,6,7] that repeats after. + for (ptrdiff_t i = 0; i < 8; ++i) { + // channels_*8 - 1 - i: 23, 22, 21, 20, 19, 18, 17, 16 + // % channels: 2, 1, 0, 2, 1, 0, 2, 1 + border_indices_left_ = + (border_indices_left_ << 8) | ((channels_ * 8 - 1 - i) % channels_); + // (7 - i): 7, 6, 5, 4, 3, 2, 1, 0 + // % channels: 1, 0, 2, 1 0, 2, 1, 0 + border_indices_right_ = + (border_indices_right_ << 8) | (((7 - i) % channels) + 8 - channels_); + } + } + + // Raw column can be bigger than width-1 or less than 0 + ptrdiff_t get_column(ptrdiff_t raw_column) const { + // TODO more border types, this is only the Replicated + return std::max(std::min(raw_column, width_ - 1), + ptrdiff_t{0}); + } + + // Assuming that start_offset is <= 0 + uint16x8_t load_left(Rows src_rows, + ptrdiff_t start_offset) const { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + uint8x8_t data = vld1_u8(&src_rows[0]); + uint64_t indices{}; + if (start_offset > -8) { + ptrdiff_t shift = -8 * start_offset; + indices = + ((border_indices_left_ >> (64 - shift)) | (data_indices_ << shift)); + } else { + ptrdiff_t shift = ((-start_offset - 8) % channels_) * 8; + indices = (((border_indices_left_ >> (8 * channels_ - shift)) & + ((1 << shift) - 1)) | + (border_indices_left_ << shift)); + } + return vmovl_u8(vtbl1_u8(data, vreinterpret_u8_u64(uint64x1_t{indices}))); + } + } + + // Assuming that start_offset is >= width - 8 + uint16x8_t load_right(Rows src_rows, + ptrdiff_t start_offset) const { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + uint8x8_t data = vld1_u8(&src_rows[width_ * channels_ - 8]); + uint64_t indices{}; + ptrdiff_t shift = 8 * (start_offset - (width_ * channels_ - 8)); + if (shift < 64) { + indices = + (data_indices_ >> shift) | (border_indices_right_ << (64 - shift)); + } else { + shift = ((start_offset - width_ * channels_) % channels_) * 8; + indices = shift == 0 + ? border_indices_right_ + : (((border_indices_right_ >> (8 * channels_ - shift)) + << (64 - shift)) | + (border_indices_right_ >> shift)); + } + return vmovl_u8(vtbl1_u8(data, vreinterpret_u8_u64(uint64x1_t{indices}))); + } + } + + private: + ptrdiff_t width_; + ptrdiff_t channels_; + uint64_t data_indices_, border_indices_left_, border_indices_right_; +}; // end of class GenericBorderHorizontal + +// Border offsets for generic filters. +template +class GenericBorderVertical final { + public: + explicit GenericBorderVertical(size_t height) + : height_(static_cast(height)) {} + + // Raw column can be bigger than width-1 or less than 0 + ptrdiff_t get_row(ptrdiff_t raw_row) const { + // TODO more border types, this is only the Replicated + return std::max(std::min(raw_row, height_ - 1), + ptrdiff_t{0}); + } + + private: + ptrdiff_t height_; +}; // end of class GenericBorderVertical + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_GENERIC_NEON_H diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index d571480b6d7b227ad525839519cbee85e5d840bd..3a75f5c2ecc8e2cfb0dff9fe6fa3828c25a4b0b5 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -7,9 +7,14 @@ #include "kleidicv/kleidicv.h" KLEIDICV_MULTIVERSION_C_API( - kleidicv_gaussian_blur_stripe_u8, &kleidicv::neon::gaussian_blur_stripe_u8, - KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_stripe_u8), - &kleidicv::sme2::gaussian_blur_stripe_u8); + kleidicv_gaussian_blur_fixed_stripe_u8, + &kleidicv::neon::gaussian_blur_fixed_stripe_u8, + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_fixed_stripe_u8), + &kleidicv::sme2::gaussian_blur_fixed_stripe_u8); + +KLEIDICV_MULTIVERSION_C_API(kleidicv_gaussian_blur_arbitrary_stripe_u8, + &kleidicv::neon::gaussian_blur_arbitrary_stripe_u8, + nullptr, nullptr); extern "C" { @@ -18,20 +23,28 @@ kleidicv_error_t kleidicv_gaussian_blur_u8( size_t width, size_t height, size_t channels, size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { - if (!kleidicv::gaussian_blur_is_implemented( - width, height, kernel_width, kernel_height, sigma_x, sigma_y)) { + auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); + if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); - if (!fixed_border_type) { + if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_width, + kernel_height, sigma_x, sigma_y, + channels, *fixed_border_type)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - return kleidicv_gaussian_blur_stripe_u8(src, src_stride, dst, dst_stride, - width, height, 0, height, channels, - kernel_width, kernel_height, sigma_x, - sigma_y, *fixed_border_type, context); + if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) { + return kleidicv_gaussian_blur_fixed_stripe_u8( + src, src_stride, dst, dst_stride, width, height, 0, height, channels, + kernel_width, kernel_height, sigma_x, sigma_y, *fixed_border_type, + context); + } + + return kleidicv_gaussian_blur_arbitrary_stripe_u8( + src, src_stride, dst, dst_stride, width, height, 0, height, channels, + kernel_width, kernel_height, sigma_x, sigma_y, *fixed_border_type, + context); } } // extern "C" diff --git a/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp b/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a0813395fb011119cae334f52d090b92927381b4 --- /dev/null +++ b/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp @@ -0,0 +1,450 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "border_generic_neon.h" +#include "kleidicv/config.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/gaussian_blur.h" +#include "kleidicv/filters/sigma.h" +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_types.h" +#include "kleidicv/workspace/separable.h" + +namespace kleidicv::neon { + +// Template for arbitrary kernel size Gaussian Blur filters. +template +class GaussianBlurArbitrary; + +template +class GaussianBlurArbitrary { + public: + using SourceType = uint8_t; + using BufferType = uint8_t; + using DestinationType = uint8_t; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderType = FixedBorderType; + + GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size, + Rectangle &rect, size_t channels) + : half_kernel_size_(half_kernel_size), + half_kernel_u16_(half_kernel), + width_(static_cast(rect.width())), + vertical_border_(rect.height()), + horizontal_border_(rect.width(), channels) {} + + // Not border-affected parts + void process_arbitrary_vertical(size_t width, Rows src_rows, + Rows buffer_rows) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + vertical_vector_path(src_rows, buffer_rows, index); + }); + + loop.tail([&](size_t index) { + vertical_scalar_path(src_rows, buffer_rows, index); + }); + } + + // Border-affected parts + void process_arbitrary_border_vertical(size_t width, + Rows src_rows, + ptrdiff_t row_index, + Rows buffer_rows) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t column_index) { + vertical_border_vector_path(src_rows, buffer_rows, row_index, + column_index); + }); + + loop.tail([&](size_t column_index) { + vertical_border_scalar_path(src_rows, buffer_rows, row_index, + column_index); + }); + } + + void process_arbitrary_horizontal( + size_t width, size_t kernel_size, Rows buffer_rows, + Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + size_t x = 0; + // Assume that there is always a widening when calculating, so the + // horizontal vector path processes double-width vectors + const size_t num_lanes = BufferVecTraits::num_lanes() / 2; + const size_t block_len = num_lanes; + const size_t margin = kernel_size / 2; + const size_t border_len = buffer_rows.channels() * margin; + const size_t border_process_len = + ((border_len + block_len - 1) / block_len) * block_len; + + for (; x < border_process_len; x += num_lanes) { + horizontal_left_border_vector_path(buffer_rows, dst_rows, x); + } + + // Process data which is not affected by any borders in bulk. + if (width * buffer_rows.channels() > 2 * border_process_len) { + size_t total_width_without_borders = + width * buffer_rows.channels() - 2 * border_process_len; + + LoopUnroll2 loop{total_width_without_borders, + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + horizontal_vector_path(buffer_rows, dst_rows, x + index); + horizontal_vector_path(buffer_rows, dst_rows, + x + index + BufferVecTraits::num_lanes()); + }); + + loop.unroll_once([&](size_t index) { + horizontal_vector_path(buffer_rows, dst_rows, x + index); + }); + + loop.tail([&](size_t index) { + horizontal_scalar_path(buffer_rows, dst_rows, x + index); + }); + + x += total_width_without_borders; + } else { + // rewind if needed, so we'll have exact vector paths at the right side + x = width * buffer_rows.channels() - border_process_len; + } + + for (; x < width * buffer_rows.channels(); x += num_lanes) { + horizontal_right_border_vector_path(buffer_rows, dst_rows, x); + } + } + + private: + void vertical_vector_path(Rows src_rows, + Rows dst_rows, ptrdiff_t x) const { + uint8x16_t src_mid = vld1q_u8(&src_rows[x]); + uint8x8_t half_kernel_mid = vdup_n_u8( + static_cast(half_kernel_u16_[half_kernel_size_ - 1])); + uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid); + uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid); + + ptrdiff_t i = 0; + // Unroll 4 times + for (; i < half_kernel_size_ - 4; i += 4) { + uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]); + uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]); + uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + uint16x8_t vec_h = vaddl_high_u8(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + uint16x8_t prod0_l = vmulq_u16(vec_l, coeff); + uint16x8_t prod0_h = vmulq_u16(vec_h, coeff); + + src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]); + src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]); + vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + vec_h = vaddl_high_u8(src_i, src_j); + coeff = vdupq_n_u16(half_kernel_u16_[i + 1]); + uint16x8_t prod1_l = vmulq_u16(vec_l, coeff); + uint16x8_t prod1_h = vmulq_u16(vec_h, coeff); + + src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]); + src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]); + vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + vec_h = vaddl_high_u8(src_i, src_j); + coeff = vdupq_n_u16(half_kernel_u16_[i + 2]); + uint16x8_t prod2_l = vmulq_u16(vec_l, coeff); + uint16x8_t prod2_h = vmulq_u16(vec_h, coeff); + + src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]); + src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]); + vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + vec_h = vaddl_high_u8(src_i, src_j); + coeff = vdupq_n_u16(half_kernel_u16_[i + 3]); + uint16x8_t prod3_l = vmulq_u16(vec_l, coeff); + uint16x8_t prod3_h = vmulq_u16(vec_h, coeff); + + uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l); + uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h); + uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l); + uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h); + + uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l); + uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h); + + acc_l = vaddq_u16(acc_l, acc_new_l); + acc_h = vaddq_u16(acc_h, acc_new_h); + } + + for (; i < half_kernel_size_ - 1; ++i) { + uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]); + uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]); + uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + uint16x8_t vec_h = vaddl_high_u8(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc_l = vmlaq_u16(acc_l, vec_l, coeff); + acc_h = vmlaq_u16(acc_h, vec_h, coeff); + } + + // Rounding + acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128)); + acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128)); + // Keep only the highest 8 bits + uint8x16_t result = + vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h)); + neon::VecTraits::store(result, &dst_rows[x]); + } + + // Where y is affected by border + void vertical_border_vector_path(Rows src_rows, + Rows dst_rows, ptrdiff_t y, + ptrdiff_t x) const { + uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]); + uint8x8_t half_kernel_mid = vdup_n_u8( + static_cast(half_kernel_u16_[half_kernel_size_ - 1])); + uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid); + uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid); + + ptrdiff_t i = 0; + for (; i < half_kernel_size_ - 1; ++i) { + uint8x16_t src_i = vld1q_u8(&src_rows.at( + vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]); + uint8x16_t src_j = vld1q_u8(&src_rows.at( + vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]); + uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + uint16x8_t vec_h = vaddl_high_u8(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc_l = vmlaq_u16(acc_l, vec_l, coeff); + acc_h = vmlaq_u16(acc_h, vec_h, coeff); + } + + // Rounding + acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128)); + acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128)); + // Keep only the highest 8 bits + uint8x16_t result = + vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h)); + neon::VecTraits::store(result, &dst_rows[x]); + } + + void vertical_scalar_path(Rows src_rows, + Rows dst_rows, ptrdiff_t x) const { + uint32_t acc = static_cast(src_rows[x]) * + half_kernel_u16_[half_kernel_size_ - 1]; + + for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) { + acc += + (static_cast(src_rows.at(i + 1 - half_kernel_size_)[x]) + + static_cast(src_rows.at(half_kernel_size_ - i - 1)[x])) * + half_kernel_u16_[i]; + } + + dst_rows[x] = static_cast(rounding_shift_right(acc, 8)); + } + + void vertical_border_scalar_path(Rows src_rows, + Rows dst_rows, ptrdiff_t y, + ptrdiff_t x) const { + uint32_t acc = static_cast(src_rows.at(y)[x]) * + half_kernel_u16_[half_kernel_size_ - 1]; + + for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) { + acc += (static_cast(src_rows.at( + vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) + + static_cast(src_rows.at(vertical_border_.get_row( + y + half_kernel_size_ - i - 1))[x])) * + half_kernel_u16_[i]; + } + + dst_rows[x] = static_cast(rounding_shift_right(acc, 8)); + } + + void horizontal_vector_path(Rows src_rows, + Rows dst_rows, + ptrdiff_t x) const { + // very similar to the vertical path, the difference is only the loading + // pattern + uint8x16_t src_mid = vld1q_u8(&src_rows[x]); + uint8x8_t half_kernel_mid = vdup_n_u8( + static_cast(half_kernel_u16_[half_kernel_size_ - 1])); + uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid); + uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid); + + ptrdiff_t ch = static_cast(src_rows.channels()), + left = x - ch * (half_kernel_size_ - 1), + right = x + ch * (half_kernel_size_ - 1); + for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) { + uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]); + uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]); + uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j)); + uint16x8_t vec_h = vaddl_high_u8(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc_l = vmlaq_u16(acc_l, vec_l, coeff); + acc_h = vmlaq_u16(acc_h, vec_h, coeff); + } + + // Rounding + acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128)); + acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128)); + // Keep only the highest 8 bits + uint8x16_t result = + vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h)); + neon::VecTraits::store(result, &dst_rows[x]); + } + + void horizontal_left_border_vector_path(Rows src_rows, + Rows dst_rows, + ptrdiff_t x) const { + // similar to the simple horizontal path, except the loading pattern: + // - this is loading indirect columns, and half of that data + uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x])); + uint16x8_t acc = + vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]); + + ptrdiff_t ch = static_cast(src_rows.channels()); + ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1), + right = x + ch * (half_kernel_size_ - 1); + for (; i * ch + left < 0; ++i) { + uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch); + uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch])); + uint16x8_t vec = vaddq_u16(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc = vmlaq_u16(acc, vec, coeff); + } + + for (; i < half_kernel_size_ - 1; ++i) { + uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch])); + uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch])); + uint16x8_t vec = vaddq_u16(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc = vmlaq_u16(acc, vec, coeff); + } + + // Store only the highest 8 bits + uint8x8_t result = vrshrn_n_u16(acc, 8); + vst1_u8(&dst_rows[x], result); + } + + void horizontal_right_border_vector_path(Rows src_rows, + Rows dst_rows, + ptrdiff_t x) const { + // similar to the simple horizontal path, except the loading pattern: + // - this is loading indirect columns, and half of that data + uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x])); + uint16x8_t acc = + vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]); + + ptrdiff_t ch = static_cast(src_rows.channels()); + ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1), + right = x + ch * (half_kernel_size_ - 1); + for (; right - i * ch > width_ * ch - 8; ++i) { + uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch])); + uint16x8_t src_j = + horizontal_border_.load_right(src_rows, right - i * ch); + uint16x8_t vec = vaddq_u16(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc = vmlaq_u16(acc, vec, coeff); + } + + for (; i < half_kernel_size_ - 1; ++i) { + uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch])); + uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch])); + uint16x8_t vec = vaddq_u16(src_i, src_j); + uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]); + acc = vmlaq_u16(acc, vec, coeff); + } + + // Store only the highest 8 bits + uint8x8_t result = vrshrn_n_u16(acc, 8); + vst1_u8(&dst_rows[x], result); + } + + void horizontal_scalar_path(Rows src_rows, + Rows dst_rows, + ptrdiff_t x) const { + uint32_t acc = static_cast(src_rows[x]) * + half_kernel_u16_[half_kernel_size_ - 1]; + ptrdiff_t ch = static_cast(src_rows.channels()); + ptrdiff_t channel_offset = x % ch; + ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1), + right_col = x / ch + (half_kernel_size_ - 1); + + for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) { + acc += (static_cast( + src_rows[horizontal_border_.get_column(left_col + i) * ch + + channel_offset]) + + static_cast( + src_rows[horizontal_border_.get_column(right_col - i) * ch + + channel_offset])) * + half_kernel_u16_[i]; + } + + dst_rows[x] = static_cast(rounding_shift_right(acc, 8)); + } + + const ptrdiff_t half_kernel_size_; + const uint16_t *half_kernel_u16_; + const ptrdiff_t width_; + KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical vertical_border_; + KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal + horizontal_border_; +}; // end of class GaussianBlurArbitrary + +template +static kleidicv_error_t gaussian_blur_arbitrary_kernel_size( + const ScalarType *src, size_t src_stride, ScalarType *dst, + size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin, + size_t y_end, size_t channels, float sigma, FixedBorderType border_type, + SeparableFilterWorkspace *workspace) { + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + const ptrdiff_t kHalfKernelSize = + static_cast(get_half_kernel_size(kernel_size)); + uint16_t half_kernel[128]; + generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); + // If sigma is so small that the middle point gets all the weights, it's + // just a copy + if (half_kernel[kHalfKernelSize - 1] < 256) { + // Only replicated border is implemented so far. + GaussianBlurArbitrary filter{ + half_kernel, kHalfKernelSize, rect, src_rows.channels()}; + workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows, + dst_rows, channels, border_type, filter); + } else { + for (size_t row = y_begin; row < y_end; ++row) { + std::memcpy(static_cast(&dst_rows.at(row)[0]), + static_cast(&src_rows.at(row)[0]), + rect.width() * sizeof(ScalarType) * dst_rows.channels()); + } + } + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_arbitrary_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t /*kernel_height*/, float sigma_x, + float /*sigma_y*/, FixedBorderType fixed_border_type, + kleidicv_filter_context_t *context) { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = gaussian_blur_checks( + src, src_stride, dst, dst_stride, width, height, channels, workspace); + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + Rectangle rect{width, height}; + + return gaussian_blur_arbitrary_kernel_size( + src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end, + channels, sigma_x, fixed_border_type, workspace); +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp similarity index 88% rename from kleidicv/src/filters/gaussian_blur_neon.cpp rename to kleidicv/src/filters/gaussian_blur_fixed_neon.cpp index 14c2f6a5e4c4a3bb281590ad0fcc7b005ef5682b..f2c29d6479a8e139dd1ee5d8d53accc172428206 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp @@ -2,9 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include +#include "kleidicv/config.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/filters/separable_filter_15x15_neon.h" @@ -13,8 +14,8 @@ #include "kleidicv/filters/separable_filter_5x5_neon.h" #include "kleidicv/filters/separable_filter_7x7_neon.h" #include "kleidicv/filters/sigma.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/workspace/border_types.h" #include "kleidicv/workspace/separable.h" namespace kleidicv::neon { @@ -301,7 +302,7 @@ class GaussianBlur { static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - explicit GaussianBlur(const std::array half_kernel) + explicit GaussianBlur(const uint16_t *half_kernel) : half_kernel_(half_kernel) {} void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { @@ -362,7 +363,7 @@ class GaussianBlur { neon::VecTraits::store(result, &dst[0]); } - const std::array half_kernel_; + const uint16_t *half_kernel_; }; // end of class GaussianBlur template @@ -385,7 +386,8 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( return KLEIDICV_OK; } else { constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - auto half_kernel = generate_gaussian_half_kernel(sigma); + uint16_t half_kernel[128]; + generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); // If sigma is so small that the middle point gets all the weights, it's // just a copy if (half_kernel[kHalfKernelSize - 1] < 256) { @@ -405,13 +407,11 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( } template -static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, - size_t src_stride, ScalarType *dst, - size_t dst_stride, Rectangle &rect, - size_t y_begin, size_t y_end, - size_t channels, float sigma, - FixedBorderType border_type, - SeparableFilterWorkspace *workspace) { +static kleidicv_error_t gaussian_blur_fixed( + size_t kernel_size, const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin, + size_t y_end, size_t channels, float sigma, FixedBorderType border_type, + SeparableFilterWorkspace *workspace) { switch (kernel_size) { case 3: return gaussian_blur_fixed_kernel_size<3, IsBinomial>( @@ -444,37 +444,8 @@ static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, } } -// Does not include checks for whether the operation is implemented. -// This must be done earlier, by gaussian_blur_is_implemented. -template -static kleidicv_error_t gaussian_blur_checks( - const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, - size_t height, size_t channels, SeparableFilterWorkspace *workspace) { - CHECK_POINTERS(workspace); - - CHECK_POINTER_AND_STRIDE(src, src_stride, height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); - CHECK_IMAGE_SIZE(width, height); - - if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - if (workspace->channels() < channels) { - return KLEIDICV_ERROR_CONTEXT_MISMATCH; - } - - Rectangle rect{width, height}; - const Rectangle &context_rect = workspace->image_size(); - if (context_rect.width() < width || context_rect.height() < height) { - return KLEIDICV_ERROR_CONTEXT_MISMATCH; - } - - return KLEIDICV_OK; -} - KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t gaussian_blur_stripe_u8( +kleidicv_error_t gaussian_blur_fixed_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t /*kernel_height*/, float sigma_x, @@ -491,14 +462,14 @@ kleidicv_error_t gaussian_blur_stripe_u8( Rectangle rect{width, height}; if (sigma_x == 0.0) { - return gaussian_blur(kernel_width, src, src_stride, dst, dst_stride, - rect, y_begin, y_end, channels, sigma_x, - fixed_border_type, workspace); + return gaussian_blur_fixed(kernel_width, src, src_stride, dst, + dst_stride, rect, y_begin, y_end, channels, + sigma_x, fixed_border_type, workspace); } - return gaussian_blur(kernel_width, src, src_stride, dst, dst_stride, - rect, y_begin, y_end, channels, sigma_x, - fixed_border_type, workspace); + return gaussian_blur_fixed(kernel_width, src, src_stride, dst, + dst_stride, rect, y_begin, y_end, channels, + sigma_x, fixed_border_type, workspace); } } // namespace kleidicv::neon diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_fixed_sc.h similarity index 92% rename from kleidicv/src/filters/gaussian_blur_sc.h rename to kleidicv/src/filters/gaussian_blur_fixed_sc.h index 50494fbd950ccb6c31c99d382d32ce4f3181e685..db9c4096183aafe5a03a9a2b555bb9ddcc0f5a0f 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_fixed_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -8,13 +8,13 @@ #include #include +#include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/filters/separable_filter_15x15_sc.h" #include "kleidicv/filters/separable_filter_21x21_sc.h" #include "kleidicv/filters/separable_filter_3x3_sc.h" #include "kleidicv/filters/separable_filter_5x5_sc.h" #include "kleidicv/filters/separable_filter_7x7_sc.h" #include "kleidicv/filters/sigma.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/workspace/separable.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -259,7 +259,7 @@ class GaussianBlur { static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - explicit GaussianBlur(const std::array half_kernel) + explicit GaussianBlur(const uint16_t *half_kernel) : half_kernel_(half_kernel) {} void vertical_vector_path( @@ -326,7 +326,7 @@ class GaussianBlur { svst1(pg, &dst[0], result); } - const std::array half_kernel_; + const uint16_t *half_kernel_; }; // end of class GaussianBlur template @@ -349,7 +349,8 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( return KLEIDICV_OK; } else { constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - auto half_kernel = generate_gaussian_half_kernel(sigma); + uint16_t half_kernel[128]; + generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); // If sigma is so small that the middle point gets all the weights, it's // just a copy if (half_kernel[kHalfKernelSize - 1] < 256) { @@ -406,36 +407,7 @@ static kleidicv_error_t gaussian_blur( } } -// Does not include checks for whether the operation is implemented. -// This must be done earlier, by gaussian_blur_is_implemented. -template -static kleidicv_error_t gaussian_blur_checks( - const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, - size_t height, size_t channels, - SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { - CHECK_POINTERS(workspace); - - CHECK_POINTER_AND_STRIDE(src, src_stride, height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); - CHECK_IMAGE_SIZE(width, height); - - if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - if (workspace->channels() < channels) { - return KLEIDICV_ERROR_CONTEXT_MISMATCH; - } - - const Rectangle &context_rect = workspace->image_size(); - if (context_rect.width() < width || context_rect.height() < height) { - return KLEIDICV_ERROR_CONTEXT_MISMATCH; - } - - return KLEIDICV_OK; -} - -static kleidicv_error_t gaussian_blur_stripe_u8_sc( +static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t /*kernel_height*/, float sigma_x, diff --git a/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp b/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3dfeb802042bb077f7ff7c9e1be6e9e7ae64d0d0 --- /dev/null +++ b/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "gaussian_blur_fixed_sc.h" + +namespace kleidicv::sme2 { + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +gaussian_blur_fixed_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, + size_t channels, size_t kernel_width, + size_t kernel_height, float sigma_x, + float sigma_y, FixedBorderType border_type, + kleidicv_filter_context_t *context) { + return gaussian_blur_fixed_stripe_u8_sc( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, + kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); +} + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_fixed_sve2.cpp similarity index 74% rename from kleidicv/src/filters/gaussian_blur_sve2.cpp rename to kleidicv/src/filters/gaussian_blur_fixed_sve2.cpp index a98aed200ad2d250335cc1322f473fde944663d1..c0740a303e67b21a405d43eb4b947fc57d0ea7fb 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_fixed_sve2.cpp @@ -1,19 +1,18 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#include "gaussian_blur_sc.h" -#include "kleidicv/filters/gaussian_blur.h" +#include "gaussian_blur_fixed_sc.h" namespace kleidicv::sve2 { KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t gaussian_blur_stripe_u8( +kleidicv_error_t gaussian_blur_fixed_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, FixedBorderType border_type, kleidicv_filter_context_t *context) { - return gaussian_blur_stripe_u8_sc( + return gaussian_blur_fixed_stripe_u8_sc( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); } diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp deleted file mode 100644 index 0ea5b4a5dfe82e6416fd90203bde45e34a4b7d28..0000000000000000000000000000000000000000 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include "gaussian_blur_sc.h" -#include "kleidicv/filters/gaussian_blur.h" - -namespace kleidicv::sme2 { - -KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t -gaussian_blur_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, - size_t dst_stride, size_t width, size_t height, - size_t y_begin, size_t y_end, size_t channels, - size_t kernel_width, size_t kernel_height, - float sigma_x, float sigma_y, - FixedBorderType border_type, - kleidicv_filter_context_t *context) { - return gaussian_blur_stripe_u8_sc( - src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, - kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); -} - -} // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/scharr_sc.h b/kleidicv/src/filters/scharr_sc.h index 836d52ba8e7ecf3596cf14b90c9f836f8be8edd9..2fdeaef9180aacce7e0ff230a694a3de713244db 100644 --- a/kleidicv/src/filters/scharr_sc.h +++ b/kleidicv/src/filters/scharr_sc.h @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include #include diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index 1da0b8db200a9c66a05ba9d08f4d13c7bf1992f6..ac80fd2a7f5d6d4b92306a77243dd1171b53e7ae 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -5,8 +5,6 @@ #ifndef KLEIDICV_RESIZE_LINEAR_SC_H #define KLEIDICV_RESIZE_LINEAR_SC_H -#include - #include #include "kleidicv/kleidicv.h" diff --git a/kleidicv/src/transform/remap_s16_sve2.cpp b/kleidicv/src/transform/remap_s16_sve2.cpp index 4d5442c91d6c9d5172af4dbcdf771af47059524f..e358e2e7c508c116a9a386ee8eeecbd8e3cb43b2 100644 --- a/kleidicv/src/transform/remap_s16_sve2.cpp +++ b/kleidicv/src/transform/remap_s16_sve2.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include #include diff --git a/kleidicv/src/transform/remap_s16point5_sve2.cpp b/kleidicv/src/transform/remap_s16point5_sve2.cpp index d9f94012fa53b9495d6da74c148d76ed9cfcbfa8..92099c734786e0c96840c52bf10bb5c4ca3f8a7d 100644 --- a/kleidicv/src/transform/remap_s16point5_sve2.cpp +++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include #include diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index 7b00b8492c6feab12c0923b58870e4efa5a20fb7..2d7088b35592ee3a0957e6cc5930b57ba729083f 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 45a49ef0d10d6d3f720f9363f7c51c4e10bd5992..ca4be36a2e37a46c97a2f4f232513f2892c07009 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -435,19 +435,32 @@ kleidicv_error_t kleidicv_thread_gaussian_blur_u8( size_t kernel_height, float sigma_x, float sigma_y, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, kleidicv_thread_multithreading mt) { - if (!kleidicv::gaussian_blur_is_implemented( - width, height, kernel_width, kernel_height, sigma_x, sigma_y)) { + auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); + if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); - if (!fixed_border_type) { + if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_width, + kernel_height, sigma_x, sigma_y, + channels, *fixed_border_type)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } + if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) { + auto callback = [=](size_t y_begin, size_t y_end, + kleidicv_filter_context_t *thread_context) { + return kleidicv_gaussian_blur_fixed_stripe_u8( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, + channels, kernel_width, kernel_height, sigma_x, sigma_y, + *fixed_border_type, thread_context); + }; + return kleidicv_thread_filter(callback, width, height, channels, + kernel_width, kernel_height, context, mt); + } + auto callback = [=](size_t y_begin, size_t y_end, kleidicv_filter_context_t *thread_context) { - return kleidicv_gaussian_blur_stripe_u8( + return kleidicv_gaussian_blur_arbitrary_stripe_u8( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, kernel_width, kernel_height, sigma_x, sigma_y, *fixed_border_type, thread_context); diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 4d8f8dc5523454a901965efe9c18cf7f9605e6a9..cd94e9b605ae97a6d4bc1d3bfedc853015d20734 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -49,8 +49,10 @@ GaussianBlur7x7: opencv_perf_imgproc '*gaussianBlur7x7/*' '($PIXEL_FORMAT, 8 GaussianBlur3x3_CustomSigma: opencv_perf_imgproc '*gaussianBlur3x3_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur5x5_CustomSigma: opencv_perf_imgproc '*gaussianBlur5x5_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur7x7_CustomSigma: opencv_perf_imgproc '*gaussianBlur7x7_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' +GaussianBlur9x9_CustomSigma: opencv_perf_imgproc '*gaussianBlur9x9_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur15x15_CustomSigma: opencv_perf_imgproc '*gaussianBlur15x15_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur21x21_CustomSigma: opencv_perf_imgproc '*gaussianBlur21x21_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' +GaussianBlur49x49_CustomSigma: opencv_perf_imgproc '*gaussianBlur49x49_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' Sobel_Gx: opencv_perf_imgproc '*Border3x3_sobelFilter.sobelFilter/*' '($PIXEL_FORMAT, 16SC1, (1, 0), BORDER_REPLICATE)' Sobel_Gy: opencv_perf_imgproc '*Border3x3_sobelFilter.sobelFilter/*' '($PIXEL_FORMAT, 16SC1, (0, 1), BORDER_REPLICATE)' diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 0fc515dcebcb9b01e7b56ce46afc42a6858ab17f..4bc90b2cb82e4859d54d905ceeae7fcb0061c789 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -4,10 +4,14 @@ #include +#include +#include + #include "framework/array.h" #include "framework/generator.h" #include "framework/kernel.h" #include "framework/utils.h" +#include "kleidicv/ctypes.h" #include "kleidicv/filters/sigma.h" #include "kleidicv/kleidicv.h" @@ -91,8 +95,9 @@ class GaussianBlurTest : public test::KernelTest { void calculate_mask(test::Array2D &mask) { constexpr size_t kHalfKernelSize = kleidicv::get_half_kernel_size(kKernelSize); - auto half_kernel = - kleidicv::generate_gaussian_half_kernel(sigma_); + std::vector half_kernel(kHalfKernelSize); + kleidicv::generate_gaussian_half_kernel(half_kernel.data(), kHalfKernelSize, + sigma_); for (size_t row = 0; row < kKernelSize; ++row) { for (size_t column = 0; column < kKernelSize; ++column) { *mask.at(row, column) = @@ -222,63 +227,100 @@ TYPED_TEST(GaussianBlur, 7x7) { GaussianBlurTest{KernelTestParams{}}.test(mask); } -const auto minimal_array_layouts = [](size_t w, size_t h) { - size_t vl = test::Options::vector_length(); - size_t margin = w / 2; +const auto minimal_array_layouts_for_fixed = [](size_t min_w, size_t min_h) { + // Number of 16-bit elements in a SIMD vector (maximum border length) + size_t margin = min_w / 2; + // minimum allowed width, needed for the fixed size kernels to activate the + // NEON scalar path + size_t min_width = 2 * margin + 1; + // two borders + unrolltwice + unrollonce + one for the tail + size_t big_width = 2 * margin + (2 + 1) * test::Options::vector_length(); + return std::array{{ + {1 * min_width, min_h + 1, 1, 1}, + {2 * min_width, min_h, 1, 2}, + {3 * min_width, min_h, 1, 3}, + {big_width, min_h, 1, 1}, + }}; +}; + +const auto minimal_array_layouts_for_arbitrary = [](size_t min_w, + size_t min_h) { + // Number of 16-bit elements in a SIMD vector (maximum border length) + size_t mbl = test::Options::vector_length() / 2; + size_t margin = min_w / 2; + size_t min_width = (margin + mbl - 1) / mbl * mbl + margin; // two borders + one for the tail, so the NEON scalar path activates - size_t small_width = 2 * margin + 1; + size_t small_width = 2 * ((margin + mbl - 1) / mbl * mbl) + 1; // two borders + unrolltwice + unrollonce + one for the tail - size_t big_width = 2 * margin + 3 * vl + 1; - return std::array{{ - {small_width, 2 * margin + 1, 1, 1}, - {big_width, h, 1, 1}, + size_t big_width = small_width + (2 + 1) * test::Options::vector_length(); + return std::array{{ + {1 * min_width, min_h + 1, 1, 1}, + {1 * small_width, min_h + 1, 1, 1}, + {2 * min_width, min_h, 1, 2}, + {3 * min_width, min_h, 1, 3}, + {big_width, min_h, 1, 1}, }}; }; +size_t minimumValidWidth(size_t kernel_size, size_t vector_length) { + if (kernel_size <= 7 || kernel_size == 15 || kernel_size == 21) { + return kernel_size - 1; + } + size_t margin = kernel_size / 2; + // Maximum Border Length + size_t bl = vector_length / 2; + return (margin + bl - 1) / bl * bl + margin; +} + TYPED_TEST(GaussianBlur, 3x3_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); -} - -TYPED_TEST(GaussianBlur, 3x3_TinySigma) { - using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, 5x5_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); -} - -TYPED_TEST(GaussianBlur, 5x5_TinySigma) { - using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, 7x7_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); } -TYPED_TEST(GaussianBlur, 7x7_TinySigma) { - using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} +// 11x11 use the generic solution. +TYPED_TEST(GaussianBlur, 11x11_CustomSigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + // TODO kReplicateBorder is temporary until we implement all borders + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_arbitrary, + kReplicateBorder, kToleranceOne} + .test_with_generated_mask(); + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_arbitrary, + kReplicateBorder, kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_arbitrary, + kReplicateBorder, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } @@ -286,19 +328,15 @@ TYPED_TEST(GaussianBlur, 7x7_TinySigma) { // Tests gaussian_blur_15x15_ API. It always uses CustomSigma. TYPED_TEST(GaussianBlur, 15x15_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .test_with_generated_mask(); - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); -} - -TYPED_TEST(GaussianBlur, 15x15_TinySigma) { - using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } @@ -306,19 +344,16 @@ TYPED_TEST(GaussianBlur, 15x15_TinySigma) { // Tests gaussian_blur_21x21_ API. It always uses CustomSigma. TYPED_TEST(GaussianBlur, 21x21_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + // TODO kReplicateBorder is temporary until we implement all borders + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .test_with_generated_mask(); - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); -} - -TYPED_TEST(GaussianBlur, 21x21_TinySigma) { - using KernelTestParams = GaussianBlurKernelTestParams; - GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, - kToleranceOne} + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, + kAllBorders, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } @@ -326,7 +361,8 @@ TYPED_TEST(GaussianBlur, 21x21_TinySigma) { TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 3, 3, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -346,7 +382,8 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { TYPED_TEST(GaussianBlur, UnsupportedBorderType5x5) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 5, 5, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -366,7 +403,8 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType5x5) { TYPED_TEST(GaussianBlur, UnsupportedBorderType7x7) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 7, 7, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -383,10 +421,36 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } +// Arbitrary kernel size algorithm only supports REPLICATED borders +TYPED_TEST(GaussianBlur, UnsupportedBorderType11x11) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 11, 11, + validSize, validSize)); + TypeParam src[1] = {}, dst[1]; + for (kleidicv_border_type_t border : { + KLEIDICV_BORDER_TYPE_WRAP, + KLEIDICV_BORDER_TYPE_REVERSE, + KLEIDICV_BORDER_TYPE_REFLECT, + KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_TRANSPARENT, + KLEIDICV_BORDER_TYPE_NONE, + }) { + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, 11, 11, 0.0, 0.0, border, context)); + } + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); +} + TYPED_TEST(GaussianBlur, UnsupportedBorderType15x15) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -406,7 +470,8 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType15x15) { TYPED_TEST(GaussianBlur, UnsupportedBorderType21x21) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 21, 21, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -426,7 +491,8 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType21x21) { TYPED_TEST(GaussianBlur, DifferentKernelSize) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -443,7 +509,8 @@ TYPED_TEST(GaussianBlur, DifferentKernelSize) { TYPED_TEST(GaussianBlur, NonZeroSigma) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -470,17 +537,24 @@ TYPED_TEST(GaussianBlur, NonZeroSigma) { } TYPED_TEST(GaussianBlur, UnsupportedKernelSize) { - using KernelTestParams = GaussianBlurKernelTestParams; + using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 257, 257, validSize, validSize)); TypeParam src[1] = {}, dst[1]; EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - validSize, validSize, 1, 33, 33, 0.0, 0.0, + validSize, validSize, 1, 1, 1, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), + validSize, validSize, 1, 257, 257, 0.0, 0.0, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); @@ -489,7 +563,8 @@ TYPED_TEST(GaussianBlur, UnsupportedKernelSize) { TYPED_TEST(GaussianBlur, NullPointer) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -515,7 +590,8 @@ TYPED_TEST(GaussianBlur, Misalignment) { } using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); TypeParam src[1] = {}, dst[1]; @@ -634,7 +710,8 @@ TYPED_TEST(GaussianBlur, ZeroImageSize15x15) { TYPED_TEST(GaussianBlur, ValidImageSize3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 3, 3, validSize, validSize)); test::Array2D src{validSize, validSize, @@ -669,7 +746,8 @@ TYPED_TEST(GaussianBlur, ValidImageSize3x3) { TYPED_TEST(GaussianBlur, ValidImageSize5x5) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 5, 5, validSize, validSize)); test::Array2D src{validSize, validSize, @@ -710,7 +788,8 @@ TYPED_TEST(GaussianBlur, ValidImageSize5x5) { TYPED_TEST(GaussianBlur, ValidImageSize7x7) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 7, 7, validSize, validSize)); test::Array2D src{validSize, validSize, @@ -754,108 +833,58 @@ TYPED_TEST(GaussianBlur, ValidImageSize7x7) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } -TYPED_TEST(GaussianBlur, UndersizeImage3x3) { - using KernelTestParams = GaussianBlurKernelTestParams; +template +void test_undersize_image() { kleidicv_filter_context_t *context = nullptr; - size_t underSize = KernelTestParams::kKernelSize - 2; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 3, 3, - validSize, validSize)); + // 8 is the number of 16-bit elements in a 128-bit vector. + size_t validWidth = minimumValidWidth(kKernelSize, 8); + size_t underWidth = validWidth - 1; + size_t validHeight = kKernelSize - 1; + size_t underHeight = validHeight - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create( + &context, 1, kKernelSize, kKernelSize, validWidth, validWidth)); TypeParam src[1] = {}, dst[1]; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, underSize, 1, 3, 3, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, validSize, 1, 3, 3, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - validSize, underSize, 1, 3, 3, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underWidth, + underHeight, 1, kKernelSize, kKernelSize, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underWidth, + validHeight, 1, kKernelSize, kKernelSize, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, + underHeight, 1, kKernelSize, kKernelSize, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } +TYPED_TEST(GaussianBlur, UndersizeImage3x3) { + test_undersize_image(); +} + TYPED_TEST(GaussianBlur, UndersizeImage5x5) { - using KernelTestParams = GaussianBlurKernelTestParams; - kleidicv_filter_context_t *context = nullptr; - size_t underSize = KernelTestParams::kKernelSize - 2; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 5, 5, - validSize, validSize)); - TypeParam src[1] = {}, dst[1]; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, underSize, 1, 5, 5, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, validSize, 1, 5, 5, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - validSize, underSize, 1, 5, 5, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + test_undersize_image(); } TYPED_TEST(GaussianBlur, UndersizeImage7x7) { - using KernelTestParams = GaussianBlurKernelTestParams; - kleidicv_filter_context_t *context = nullptr; - size_t underSize = KernelTestParams::kKernelSize - 2; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 7, 7, - validSize, validSize)); - TypeParam src[1] = {}, dst[1]; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, underSize, 1, 7, 7, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, validSize, 1, 7, 7, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - validSize, underSize, 1, 7, 7, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + test_undersize_image(); +} + +TYPED_TEST(GaussianBlur, UndersizeImage9x9) { + test_undersize_image(); } TYPED_TEST(GaussianBlur, UndersizeImage15x15) { - using KernelTestParams = GaussianBlurKernelTestParams; - kleidicv_filter_context_t *context = nullptr; - size_t underSize = KernelTestParams::kKernelSize - 2; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, - validSize, validSize)); - TypeParam src[1] = {}, dst[1]; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, underSize, 1, 15, 15, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - underSize, validSize, 1, 15, 15, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), - validSize, underSize, 1, 15, 15, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REPLICATE, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + test_undersize_image(); +} + +TYPED_TEST(GaussianBlur, UndersizeImage21x21) { + test_undersize_image(); } TYPED_TEST(GaussianBlur, OversizeImage) { @@ -866,20 +895,26 @@ TYPED_TEST(GaussianBlur, OversizeImage) { EXPECT_EQ(KLEIDICV_ERROR_RANGE, gaussian_blur()( src, sizeof(TypeParam), dst, sizeof(TypeParam), - KLEIDICV_MAX_IMAGE_PIXELS + 1, 15, 1, 15, 15, 0.0, 0.0, + KLEIDICV_MAX_IMAGE_PIXELS + 1, 14, 1, 15, 15, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 14, + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 15, 15, 0.0, 0.0, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_ERROR_RANGE, gaussian_blur()( src, sizeof(TypeParam), dst, sizeof(TypeParam), - KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 15, 15, - 15, 0.0, 0.0, KLEIDICV_BORDER_TYPE_REFLECT, context)); + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, 15, 15, + 0.0, 0.0, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } TYPED_TEST(GaussianBlur, ChannelNumber) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); @@ -895,7 +930,8 @@ TYPED_TEST(GaussianBlur, ChannelNumber) { TYPED_TEST(GaussianBlur, InvalidContextMaxChannels) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); @@ -905,13 +941,19 @@ TYPED_TEST(GaussianBlur, InvalidContextMaxChannels) { gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 2, 15, 15, 0.0, 0.0, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ( + KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), + validSize, validSize, 2, 11, 11, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextImageSize) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); @@ -931,13 +973,29 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, validSize + 1, 1, 15, 15, 0.0, 0.0, KLEIDICV_BORDER_TYPE_REFLECT, context)); + // Arbitrary kernel size as well + EXPECT_EQ( + KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), + validSize + 1, validSize, 1, 11, 11, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ( + KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), + validSize, validSize + 1, 1, 11, 11, 0.0, 0.0, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ( + KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur()(src, sizeof(TypeParam), dst, sizeof(TypeParam), + validSize + 1, validSize + 1, 1, 11, 11, 0.0, + 0.0, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } TYPED_TEST(GaussianBlur, InvalidKernelSize) { kleidicv_filter_context_t *context = nullptr; - size_t kernel_size = 17; + size_t kernel_size = 16; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create( &context, 1, 15, 15, kernel_size, kernel_size)); @@ -959,7 +1017,8 @@ TYPED_TEST(GaussianBlur, InvalidKernelSize) { TYPED_TEST(GaussianBlur, InvalidBorderType) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; + size_t validSize = minimumValidWidth(KernelTestParams::kKernelSize, + test::Options::vector_length()); ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, validSize, validSize)); @@ -982,16 +1041,17 @@ TYPED_TEST(GaussianBlur, InvalidBorderType) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } -template -static std::array generate_reference_kernel(float sigma) { - std::array float_kernel{}; +static std::vector generate_reference_kernel(size_t half_size, + float sigma) { + std::vector float_kernel(half_size); - for (size_t i = 0; i < Size; ++i) { - float_kernel[i] = std::exp(-1 * std::pow(i, 2) / (2 * std::pow(sigma, 2))); + for (size_t i = 0; i < half_size; ++i) { + float_kernel[i] = static_cast( + std::exp(-std::pow(i, 2) / (2 * std::pow(sigma, 2)))); } float sum = 0; - for (auto val : float_kernel) { + for (float val : float_kernel) { sum += val; } @@ -1005,14 +1065,15 @@ static std::array generate_reference_kernel(float sigma) { val *= 256; } - std::array kernel_to_return{}; + std::vector kernel_to_return(half_size); // Conversion with rounding error diffusion float last_rounding_error = 0.0; - for (size_t i = 0; i < Size; ++i) { - float corrected_value = float_kernel[Size - 1 - i] - last_rounding_error; + for (size_t i = 0; i < half_size; ++i) { + float corrected_value = + float_kernel[half_size - 1 - i] - last_rounding_error; float rounded_value = std::round(corrected_value); last_rounding_error = rounded_value - corrected_value; - kernel_to_return[i] = rounded_value; + kernel_to_return[i] = static_cast(rounded_value); } return kernel_to_return; @@ -1020,19 +1081,18 @@ static std::array generate_reference_kernel(float sigma) { template void test_sigma() { - const std::array expected_half_kernel = - generate_reference_kernel(3.0); - const std::array actual_half_kernel = - kleidicv::generate_gaussian_half_kernel(3.0); + const std::vector expected_half_kernel = + generate_reference_kernel(Size, 3.0); + std::vector actual_half_kernel(Size); + kleidicv::generate_gaussian_half_kernel(actual_half_kernel.data(), Size, 3.0); EXPECT_EQ(expected_half_kernel, actual_half_kernel); - const std::array expected_half_kernel1 = - generate_reference_kernel(((Size * 2) - 1) * 0.15 + 0.35); - const std::array actual_half_kernel1 = - kleidicv::generate_gaussian_half_kernel(0.0); + const std::vector expected_half_kernel1 = + generate_reference_kernel(Size, ((Size * 2) - 1) * 0.15 + 0.35); + kleidicv::generate_gaussian_half_kernel(actual_half_kernel.data(), Size, 0.0); - EXPECT_EQ(expected_half_kernel1, actual_half_kernel1); + EXPECT_EQ(expected_half_kernel1, actual_half_kernel); } TYPED_TEST(GaussianBlur, KernelGenerationFromSigma) { diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 03649e374a124c2a992f5bc437d17449aaf62a36..9a3263dc5256ff461ef8a8722ea413e8761694b5 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -278,6 +278,25 @@ class Thread : public testing::TestWithParam

{ EXPECT_EQ(KLEIDICV_OK, multi_result); EXPECT_EQ_ARRAY2D(dst_multi, dst_single); } + + void check_gaussian_blur_u8(unsigned width, unsigned height, + size_t kernel_size) { + size_t channels = 1; + size_t kernel_width = kernel_size; + size_t kernel_height = kernel_size; + float sigma_x = 0.0F, sigma_y = 0.0F; + kleidicv_border_type_t border_type = KLEIDICV_BORDER_TYPE_REPLICATE; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, channels, kernel_width, + kernel_height, width, height)); + check_unary_op( + kleidicv_gaussian_blur_u8, kleidicv_thread_gaussian_blur_u8, + channels /*src_channels*/, channels /*dst_channels*/, + /*remaining arguments passed to gaussian_blur_u8 functions*/ channels, + kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + } }; #define TEST_UNARY_OP(suffix, SrcT, DstT, ...) \ @@ -348,25 +367,18 @@ TEST_BINARY_OP(compare_equal_u8, uint8_t, 1, 1); TEST_BINARY_OP(compare_greater_u8, uint8_t, 1, 1); TEST_BINARY_OP(saturating_add_abs_with_threshold_s16, int16_t, 1, 1, 123); -TEST_P(Thread, gaussian_blur_u8) { +TEST_P(Thread, gaussian_blur_fixed_u8) { unsigned width = 0, height = 0, thread_count = 0; std::tie(width, height, thread_count) = GetParam(); (void)thread_count; - size_t channels = 1; - size_t kernel_width = 5; - size_t kernel_height = kernel_width; - float sigma_x = 0.0F, sigma_y = 0.0F; - kleidicv_border_type_t border_type = KLEIDICV_BORDER_TYPE_REPLICATE; - kleidicv_filter_context_t *context = nullptr; - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_context_create(&context, channels, kernel_width, - kernel_height, width, height)); - check_unary_op( - kleidicv_gaussian_blur_u8, kleidicv_thread_gaussian_blur_u8, - channels /*src_channels*/, channels /*dst_channels*/, - /*remaining arguments passed to gaussian_blur_u8 functions*/ channels, - kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + check_gaussian_blur_u8(width, height, 5); +} + +TEST_P(Thread, gaussian_blur_arbitrary_u8) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + (void)thread_count; + check_gaussian_blur_u8(width, height, 11); } template