diff --git a/CHANGELOG.md b/CHANGELOG.md index 935e95a4877abf203a863049fa10e26bbee5d438..2584beee116f11cedd3852eec3d16a6ebea68e4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 3x3 kernels. -- Median Blur for 9x9, 11x11, 13x13 and 15x15 kernels, Neon backend only. +- Median Blur for generic kernels (odd-sized only, max kernel size 255x255), Neon backend only. ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 4d35a01111cfb5c5ac935bcbb8987dd545c530b7..75d9cf5441decaaea6af89a89f8fd114bef6d913 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -616,7 +616,10 @@ BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 1, , kleidicv_median_blur_u8) ->Arg(9) ->Arg(11) ->Arg(13) - ->Arg(15); + ->Arg(15) + ->Arg(17) + ->Arg(27) + ->Arg(35); BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 4, , kleidicv_median_blur_u8) ->Arg(3) ->Arg(5) @@ -624,7 +627,10 @@ BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 4, , kleidicv_median_blur_u8) ->Arg(9) ->Arg(11) ->Arg(13) - ->Arg(15); + ->Arg(15) + ->Arg(17) + ->Arg(27) + ->Arg(35); BENCHMARK_TEMPLATE2_CAPTURE(median_blur, int8_t, 1, , kleidicv_median_blur_s8) ->Arg(3) diff --git a/conformity/opencv/test_median_blur.cpp b/conformity/opencv/test_median_blur.cpp index 85b78bc29be546ac4a395c7c88e905b7406c3f27..f9b9bffb1a323f3b1368bd4a24083a12410dc782 100644 --- a/conformity/opencv/test_median_blur.cpp +++ b/conformity/opencv/test_median_blur.cpp @@ -19,9 +19,23 @@ bool test_median_blur(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { cv::RNG rng(0); - for (size_t x = 5; x <= 16; ++x) { - for (size_t y = 5; y <= 16; ++y) { - cv::Mat input(x, y, get_opencv_matrix_type()); + size_t size_min{0}; + size_t size_max{0}; + size_t step{0}; + + if constexpr (KernelSize <= 15) { + size_min = KernelSize - 1; + size_max = 2 * KernelSize + 16; + step = 1; + } else { + size_min = KernelSize - 1; + size_max = KernelSize + 16; + step = 16; + } + + for (size_t w = size_min; w <= size_max; w += step) { + for (size_t h = size_min; h <= size_max; h += step) { + cv::Mat input(w, h, get_opencv_matrix_type()); rng.fill(input, cv::RNG::UNIFORM, std::numeric_limits::min(), std::numeric_limits::max()); @@ -30,7 +44,7 @@ bool test_median_blur(int index, RecreatedMessageQueue& request_queue, reply_queue, input); if (are_matrices_different(0, actual, expected)) { - fail_print_matrices(x, y, input, actual, expected); + fail_print_matrices(w, h, input, actual, expected); return true; } } @@ -82,6 +96,18 @@ std::vector& median_blur_tests_get() { TEST("Median 15x15, 1 channel (U8)", (test_median_blur<15, uint8_t, 1>), exec_median_blur<15>), TEST("Median 15x15, 3 channel (U8)", (test_median_blur<15, uint8_t, 3>), exec_median_blur<15>), TEST("Median 15x15, 4 channel (U8)", (test_median_blur<15, uint8_t, 4>), exec_median_blur<15>), + TEST("Median 17x17, 1 channel (U8)", (test_median_blur<17, uint8_t, 1>), exec_median_blur<17>), + TEST("Median 17x17, 3 channel (U8)", (test_median_blur<17, uint8_t, 3>), exec_median_blur<17>), + TEST("Median 17x17, 4 channel (U8)", (test_median_blur<17, uint8_t, 4>), exec_median_blur<17>), + TEST("Median 27x27, 1 channel (U8)", (test_median_blur<27, uint8_t, 1>), exec_median_blur<27>), + TEST("Median 27x27, 3 channel (U8)", (test_median_blur<27, uint8_t, 3>), exec_median_blur<27>), + TEST("Median 27x27, 4 channel (U8)", (test_median_blur<27, uint8_t, 4>), exec_median_blur<27>), + TEST("Median 35x35, 1 channel (U8)", (test_median_blur<35, uint8_t, 1>), exec_median_blur<35>), + TEST("Median 35x35, 3 channel (U8)", (test_median_blur<35, uint8_t, 3>), exec_median_blur<35>), + TEST("Median 35x35, 4 channel (U8)", (test_median_blur<35, uint8_t, 4>), exec_median_blur<35>), + TEST("Median 255x255, 1 channel (U8)", (test_median_blur<255, uint8_t, 1>), exec_median_blur<255>), + TEST("Median 255x255, 3 channel (U8)", (test_median_blur<255, uint8_t, 3>), exec_median_blur<255>), + TEST("Median 255x255, 4 channel (U8)", (test_median_blur<255, uint8_t, 4>), exec_median_blur<255>), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 255fb9beb1aef304bac65b0e1b35d6024c505f7c..984e784e0d005b40455e4caff0cdaa3101406985 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -80,7 +80,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Separable Filter 2D (5x5) | | x | x | x | | | | | Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | | Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | -| Median Blur (9x9, 11x11, 13x13, 15x15) | | x | | | | | | +| Median Blur (generic imp, max size 255x255) | | x | | | | | | ## Resize to quarter diff --git a/doc/opencv.md b/doc/opencv.md index 5175804c662b216a99137ef903004007d8ab9698..9672b1e58f043ab281860b857d6d700b88657c79 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -166,10 +166,8 @@ Applies median filter to a given image. Notes on parameters: * `src.cols`,`src.rows` - image width and height must be greater than or equal to `ksize - 1` (e.g., `>= 4` for 5x5, `>= 14` for 15x15). -* `ksize` - - for `CV_8U`, supported kernel sizes are 3x3, 5×5, 7×7, 9×9, 11×11, 13×13, and 15×15. - - For other types, only 3x3, 5×5 and 7×7 are supported. - +* `ksize` - for `CV_8U`, supported kernel sizes are 3×3 to 255x255.\ + For other types, only 3x3, 5×5 and 7×7 are supported. ### [`cv::transpose()`](https://docs.opencv.org/4.10.0/d2/de8/group__core__array.html#ga46630ed6c0ea6254a35f447289bd7404) Transposes a matrix. diff --git a/kleidicv/include/kleidicv/filters/median_blur.h b/kleidicv/include/kleidicv/filters/median_blur.h index 16a30f7acc32853dbd94fd9add739ab9ef6bd57d..8fbaf2c36db306fa94328c8ec964efece25f6ec5 100644 --- a/kleidicv/include/kleidicv/filters/median_blur.h +++ b/kleidicv/include/kleidicv/filters/median_blur.h @@ -93,6 +93,16 @@ KLEIDICV_API_DECLARATION(kleidicv_median_blur_small_hist_stripe_u8, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, kleidicv::FixedBorderType border_type); + +// For internal use only. See instead kleidicv_median_blur_u8. +// Find a median across an image. +// The stripe is defined by the range (y_begin, y_end]. +KLEIDICV_API_DECLARATION(kleidicv_median_blur_large_hist_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, + kleidicv::FixedBorderType border_type); } namespace kleidicv { @@ -108,6 +118,11 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, FixedBorderType border_type); + +kleidicv_error_t median_blur_large_hist_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type); } // namespace neon namespace sve2 { @@ -142,10 +157,16 @@ inline kleidicv_error_t check_ptrs_strides_imagesizes(const T *src, return KLEIDICV_OK; } +template inline bool is_kernel_size_supported(size_t kernel_width, size_t kernel_height) { - return (kernel_width == kernel_height) && (kernel_width >= 3) && - (kernel_width <= 15) && ((kernel_width % 2) != 0); + if constexpr (std::is_same_v) { + return (kernel_width == kernel_height) && (kernel_width >= 3) && + (kernel_width <= 255) && ((kernel_width % 2) != 0); + } else { + return (kernel_width == kernel_height) && (kernel_width >= 3) && + (kernel_width <= 7) && ((kernel_width % 2) != 0); + } } template @@ -163,7 +184,7 @@ inline std::pair median_blur_is_implemented( if ((src != dst) && (channels <= KLEIDICV_MAXIMUM_CHANNEL_COUNT) && (height >= kernel_height - 1) && (width >= kernel_width - 1) && - is_kernel_size_supported(kernel_width, kernel_height) && + is_kernel_size_supported(kernel_width, kernel_height) && fixed_border_type.has_value()) { return std::make_pair(KLEIDICV_OK, *fixed_border_type); } diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 0b56a5a2065a0a66cbd1476d9d23aca5f344a03e..257470a2ffb0e78bc3a9e9a258b7adaf407eaddd 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -2058,12 +2058,12 @@ kleidicv_error_t kleidicv_warp_perspective_u8( /// @ref KLEIDICV_MAXIMUM_CHANNEL_COUNT. /// @param kernel_width Width of the Median kernel. Must be odd and equal to /// `kernel_height`. -/// For `uint8_t`, values 3 to 15 are supported. For other -/// types, only 3, 5 and 7. +/// For `uint8_t`, values 3 to 255 are supported. For +/// other types, only 3, 5 and 7. /// @param kernel_height Height of the Median kernel. Must be odd and equal to /// `kernel_width`. -/// For `uint8_t`, values 3 to 15 are supported. For other -/// types, only 3, 5 and 7. +/// For `uint8_t`, values 3 to 255 are supported. For +/// other types, only 3, 5 and 7. /// @param border_type Way of handling the border. The supported border types /// are: \n /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE \n diff --git a/kleidicv/src/filters/median_blur_api.cpp b/kleidicv/src/filters/median_blur_api.cpp index 5c4b79c22aaf2fb8dcd0ab57c224545ee2104f99..2a5e20ac181013f74c9910963071d321f2eb79c5 100644 --- a/kleidicv/src/filters/median_blur_api.cpp +++ b/kleidicv/src/filters/median_blur_api.cpp @@ -33,6 +33,10 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_median_blur_small_hist_stripe_u8, &kleidicv::neon::median_blur_small_hist_stripe_u8, nullptr, nullptr); +KLEIDICV_MULTIVERSION_C_API(kleidicv_median_blur_large_hist_stripe_u8, + &kleidicv::neon::median_blur_large_hist_stripe_u8, + nullptr, nullptr); + extern "C" { kleidicv_error_t kleidicv_median_blur_s8(const int8_t *src, size_t src_stride, @@ -70,13 +74,19 @@ kleidicv_error_t kleidicv_median_blur_u8(const uint8_t *src, size_t src_stride, return checks_result; } - if (kernel_width > 7) { + if (kernel_width <= 7) { + return kleidicv_median_blur_sorting_network_stripe_u8( + src, src_stride, dst, dst_stride, width, height, 0, height, channels, + kernel_width, kernel_height, fixed_border_type); + } + + if (kernel_width > 7 && kernel_width <= 15) { return kleidicv_median_blur_small_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, 0, height, channels, kernel_width, kernel_height, fixed_border_type); } - return kleidicv_median_blur_sorting_network_stripe_u8( + return kleidicv_median_blur_large_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, 0, height, channels, kernel_width, kernel_height, fixed_border_type); } diff --git a/kleidicv/src/filters/median_blur_border_handling.h b/kleidicv/src/filters/median_blur_border_handling.h new file mode 100644 index 0000000000000000000000000000000000000000..3feab6d41474083409d2c29595695f14b8d739ea --- /dev/null +++ b/kleidicv/src/filters/median_blur_border_handling.h @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H +#define KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H + +#include + +#include "kleidicv/kleidicv.h" + +namespace kleidicv::neon { + +static ptrdiff_t get_physical_index(size_t index, size_t limit, + FixedBorderType border_type) { + int result = 0; + int signed_index = static_cast(index); + int signed_limit = static_cast(limit); + + if (signed_index >= 0 && signed_index < signed_limit) { + return static_cast(index); + } + switch (border_type) { + case FixedBorderType::REFLECT: { + if (signed_index < 0) { + result = -signed_index - 1; + } else { + result = 2 * signed_limit - signed_index - 1; + } + break; + } + + case FixedBorderType::WRAP: { + if (signed_index < 0) { + result = signed_limit + signed_index; + } else { + result = signed_index - signed_limit; + } + break; + } + + case FixedBorderType::REVERSE: { + if (signed_index < 0) { + result = std::min(-signed_index, signed_limit - 1); + } else { + result = 2 * signed_limit - signed_index - 2; + } + break; + } + default: /* FixedBorderType::REPLICATE */ { + result = std::clamp(signed_index, 0, signed_limit - 1); + break; + } + } + + return static_cast(result); +} + +} // namespace kleidicv::neon +#endif // KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H diff --git a/kleidicv/src/filters/median_blur_large_hist_neon.cpp b/kleidicv/src/filters/median_blur_large_hist_neon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cae3fa15ccfcc4be7971040d97411167ccae286e --- /dev/null +++ b/kleidicv/src/filters/median_blur_large_hist_neon.cpp @@ -0,0 +1,649 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/median_blur.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "median_blur_border_handling.h" + +namespace kleidicv::neon { + +template +class MedianBlurLargeHist { + public: + MedianBlurLargeHist(size_t channels, size_t kMargin) + : patched_coarse{static_cast( + malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * + sizeof(uint16_t)))}, + patched_fine{ + &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]}, + H{}, + luc{} {} + + ~MedianBlurLargeHist() { free(patched_coarse); } + + void process_pixels_without_horizontal_borders( + Rectangle image_dimensions, Point starting_coordinates, + Point ending_coordinates, Rows src_rows, + Rows dst_rows, size_t ksize, FixedBorderType border_type) { + const size_t kMargin = ksize / 2; + + for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); + w += patch_length) { + const size_t total_patch_span = + std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; + + Rows shifted_dst{ + &dst_rows[0] + static_cast(w) * dst_rows.channels(), + dst_rows.stride(), dst_rows.channels()}; + + clear_patched_histogram(total_patch_span * src_rows.channels()); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + // We initialize with ksize rows to allow merging of + // histogram increment and decrement operations in the main loop. + // This extra initial load enables a single update phase and avoids + // splitting the logic into separate steps. + for (size_t r = 0; r < ksize; ++r) { + const ptrdiff_t valid_h = + get_physical_index(starting_coordinates.y() + r - kMargin, + image_dimensions.height(), border_type); + initialize_patched_histogram_without_horizontal_borders( + src_rows, c, valid_h, total_patch_span, w, kMargin); + } + compute_patch_median_from_histogram(starting_coordinates.y(), c, + total_patch_span, kMargin, ksize, + shifted_dst); + } + + for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); + ++h) { + const ptrdiff_t valid_old_h = get_physical_index( + h - kMargin - 1, image_dimensions.height(), border_type); + + const ptrdiff_t valid_new_h = get_physical_index( + h + kMargin, image_dimensions.height(), border_type); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + update_patch_histogram_without_horizontal_borders( + src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, + kMargin); + + compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, + ksize, shifted_dst); + } + } + } + } + + void process_pixels_with_horizontal_borders( + Rectangle image_dimensions, Point starting_coordinates, + Point ending_coordinates, Rows src_rows, + Rows dst_rows, size_t ksize, FixedBorderType border_type) { + const size_t kMargin = ksize / 2; + + for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); + w += patch_length) { + const size_t total_patch_span = + std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; + Rows shifted_dst{ + &dst_rows[0] + static_cast(w) * dst_rows.channels(), + dst_rows.stride(), dst_rows.channels()}; + + clear_patched_histogram(total_patch_span * src_rows.channels()); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + // We initialize with ksize rows to allow merging of + // histogram increment and decrement operations in the main loop. + // This extra initial load enables a single update phase and avoids + // splitting the logic into separate steps. + for (size_t r = 0; r < ksize; ++r) { + const ptrdiff_t valid_h = + get_physical_index(starting_coordinates.y() + r - kMargin, + image_dimensions.height(), border_type); + initialize_patched_histogram_with_horizontal_borders( + src_rows, c, valid_h, total_patch_span, w, kMargin, + image_dimensions.width(), border_type); + } + compute_patch_median_from_histogram(starting_coordinates.y(), c, + total_patch_span, kMargin, ksize, + shifted_dst); + } + + for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); + ++h) { + const ptrdiff_t valid_old_h = get_physical_index( + h - kMargin - 1, image_dimensions.height(), border_type); + const ptrdiff_t valid_new_h = get_physical_index( + h + kMargin, image_dimensions.height(), border_type); + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + update_patch_histogram_with_horizontal_borders( + src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, + kMargin, image_dimensions.width(), border_type); + + compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, + ksize, shifted_dst); + } + } + } + } + + private: + // Histogram Buffer Layout Explanation + // ----------------------------------- + // patched_coarse: + // - Conceptually a 3D array: + // coarse[channel_idx][patch_offset][coarse_bin] + // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] + // - Flattened as: + // coarse_offset = 16 * (patch_length * channel_idx + patch_offset) + // + (incoming_pixel >> 4); + // + // patched_fine: + // - Conceptually a 4D array: + // fine[channel_idx][coarse_bin][patch_offset][fine_bin] + // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] + // - fine_bin = incoming_pixel & 0xF ∈ [0, 15] + // - Flattened as: + // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin) + // + patch_offset) + fine_bin; + // + // This layout enables fast linear access while preserving the hierarchical + // structure of histograms per channel and patch position. + constexpr static size_t patch_length = 512; + uint16_t* patched_coarse; + uint16_t* patched_fine; + + // Clear only the portion of the patched histogram buffers that will be used + // for the current patch. + // Since these buffers are large, there's no need to zero out the entire + // allocation— only the section relevant to the current total patch size is + // cleared for efficiency. + void clear_patched_histogram(size_t total_patch_size) { + std::memset(patched_coarse, 0, + 16 * total_patch_size * sizeof(patched_coarse[0])); + std::memset(patched_fine, 0, + 256 * total_patch_size * sizeof(patched_fine[0])); + } + + void scalar_initialize_patched_histogram(int incoming_pixel, + size_t channel_idx, + size_t patch_length, + size_t patch_offset) { + const size_t coarse_offset = + 16 * (patch_length * channel_idx + patch_offset) + + (incoming_pixel >> 4); + const size_t fine_offset = + 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) + + patch_offset) + + (incoming_pixel & 0xF); + patched_coarse[coarse_offset]++; + patched_fine[fine_offset]++; + } + + void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels, + size_t channel_idx, + size_t patch_length, + size_t patch_offset) { + KLEIDICV_FORCE_LOOP_UNROLL + for (int i = 0; i < 16; i++) { + const size_t coarse_offset_incoming = + 16 * (patch_length * channel_idx + patch_offset + i) + + (incoming_pixels[i] >> 4); + + const size_t fine_offset_incoming = + 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + + patch_offset + i) + + (incoming_pixels[i] & 0xF); + + patched_coarse[coarse_offset_incoming]++; + patched_fine[fine_offset_incoming]++; + } + } + + void initialize_patched_histogram_without_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_h, + size_t total_patch_span, size_t starting_width, size_t kMargin) { + size_t vector_part = 0; + if constexpr (is_single_channel) { + vector_part = (total_patch_span >> 4) << 4; + for (size_t patch_offset = 0; patch_offset < vector_part; + patch_offset += 16) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]); + vector_initialize_patched_histogram(incoming_pixels, c, + total_patch_span, patch_offset); + } + } + + for (size_t patch_offset = vector_part; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; + + scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); + } + } + + void initialize_patched_histogram_with_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_h, + size_t total_patch_span, size_t starting_width, size_t kMargin, + size_t width, FixedBorderType border_type) { + for (size_t patch_offset = 0; patch_offset < total_patch_span; + ++patch_offset) { + ptrdiff_t valid_w = get_physical_index( + starting_width + patch_offset - kMargin, width, border_type); + + auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; + + scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); + } + } + + // During vertical traversal (the main 'height' loop), each sliding window + // iteration introduces a new incoming row and removes an outgoing one. The + // histogram must be updated accordingly by subtracting the contributions of + // the outgoing row and adding those of the incoming row. + // Both increment and decrement operations are handled inside the same + // function. + void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { + const size_t coarse_offset_base = + 16 * (patch_length * channel_idx + patch_offset); + const size_t fine_offset_base = + 16 * patch_length * 16 * channel_idx + 16 * patch_offset; + + const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4); + const size_t mask_new_pixel = (incoming_pixel & 0xF); + const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4); + const size_t mask_old_pixel = (outgoing_pixel & 0xF); + + const size_t fine_new_offset = fine_offset_base + mask_new_pixel + + 16 * patch_length * pixel_new_shift_right_4; + const size_t coarse_new_offset = + coarse_offset_base + pixel_new_shift_right_4; + + const size_t fine_old_offset = fine_offset_base + mask_old_pixel + + 16 * patch_length * pixel_old_shift_right_4; + const size_t coarse_old_offset = + coarse_offset_base + pixel_old_shift_right_4; + + patched_coarse[coarse_new_offset]++; + patched_coarse[coarse_old_offset]--; + patched_fine[fine_new_offset]++; + patched_fine[fine_old_offset]--; + } + + void vector_update_patch_histogram(uint8x16_t& outgoing_pixels, + uint8x16_t& incoming_pixels, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { + KLEIDICV_FORCE_LOOP_UNROLL + for (int i = 0; i < 16; i++) { + const size_t coarse_offset_incoming = + 16 * (patch_length * channel_idx + patch_offset + i) + + (incoming_pixels[i] >> 4); + + const size_t coarse_offset_outgoing = + 16 * (patch_length * channel_idx + patch_offset + i) + + (outgoing_pixels[i] >> 4); + + const size_t fine_offset_incoming = + 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + + patch_offset + i) + + (incoming_pixels[i] & 0xF); + + const size_t fine_offset_outgoing = + 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) + + patch_offset + i) + + (outgoing_pixels[i] & 0xF); + + patched_coarse[coarse_offset_incoming]++; + patched_coarse[coarse_offset_outgoing]--; + patched_fine[fine_offset_incoming]++; + patched_fine[fine_offset_outgoing]--; + } + } + + void update_patch_histogram_without_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, + ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, + size_t kMargin) { + size_t vector_part = 0; + if constexpr (is_single_channel) { + vector_part = (total_patch_span >> 4) << 4; + for (size_t patch_offset = 0; patch_offset < vector_part; + patch_offset += 16) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]); + auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]); + vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c, + total_patch_span, patch_offset); + } + } + + for (size_t patch_offset = vector_part; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; + auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; + scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); + } + } + + void update_patch_histogram_with_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, + ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, + size_t kMargin, size_t width, FixedBorderType border_type) { + for (size_t patch_offset = 0; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = get_physical_index( + starting_width + patch_offset - kMargin, width, border_type); + auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; + auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; + scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); + } + } + + // `H` and `luc` are used for computing the median value for a single + // output pixel: + // - `H` is a histogram structure holding both the coarse and fine + // bins needed + // for the current element's median calculation. + // - `luc` is a lookup table that stores the last processed offset + // (index) + // for each coarse bin. This allows incremental fine histogram + // updates instead of full recalculation when histogram overlap is + // high. Since neighboring patches in natural images often have + // similar pixel values, reusing previous histogram state can + // significantly reduce processing time. + typedef struct { + uint16_t coarse[16]; + uint16_t fine[16][16]; + } Histogram; + + Histogram H; + uint16_t luc[16]; + + void clear_lookup_table(void) { + std::memset(&H, 0, sizeof(Histogram)); + std::memset(luc, 0, 16 * sizeof(uint16_t)); + } + + void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh, size_t kMargin) { + for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) { + v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); + } + } + + void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh) { + v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); + vst1q_u16(H.coarse, v_coarsel); + vst1q_u16(H.coarse + 8, v_coarseh); + } + + void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh) { + v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8)); + } + + void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, + uint16_t& luc, size_t patch_offset, size_t kMargin, + size_t total_patch_span) { + for (luc = static_cast(patch_offset - kMargin); + luc < static_cast( + std::min(patch_offset + kMargin + 1, total_patch_span)); + ++luc, px += 16) { + v_finel = vaddq_u16(v_finel, vld1q_u16(px)); + v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8)); + } + } + + void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, + uint16_t& luc, size_t patch_offset, size_t kMargin, + size_t total_patch_span) { + for (; luc < static_cast(patch_offset + kMargin + 1); ++luc) { + constexpr ptrdiff_t stride = 16; + const ptrdiff_t patch_span_limit = + static_cast(total_patch_span - 1); + const ptrdiff_t safe_luc = static_cast(luc); + + const ptrdiff_t base_offset = + stride * std::min(safe_luc, patch_span_limit); + const ptrdiff_t old_offset = + stride * std::max(safe_luc - static_cast(2 * kMargin + 1), + ptrdiff_t{0}); + const uint16x8_t new_vecl = vld1q_u16(px + base_offset); + const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8); + const uint16x8_t old_vecl = vld1q_u16(px + old_offset); + const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8); + v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl); + v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech); + } + } + + size_t find_coarse_index(size_t& cdf, size_t median_index) { + size_t coarse_index = 0; + while (true) { + cdf += H.coarse[coarse_index]; + if (cdf > median_index) { + cdf -= H.coarse[coarse_index]; + break; + } + coarse_index++; + } + return coarse_index; + } + + uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) { + uint16_t* segment = H.fine[coarse_index]; + size_t fine_index = 0; + while (true) { + cdf += segment[fine_index]; + if (cdf > median_index) { + fine_index = (16 * coarse_index + fine_index); + break; + } + fine_index++; + } + return uint8_t(fine_index); + } + + void compute_patch_median_from_histogram(size_t h, ptrdiff_t c, + size_t total_patch_span, + size_t kMargin, size_t ksize, + Rows dst) { + uint16x8_t v_coarsel = vld1q_u16(H.coarse); + uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8); + + // Before starting the main patch loop to compute medians for each element, + // we initialize the coarse histogram buffer with the first (ksize - 1). + // This allows each subsequent iteration in the patch loop to perform only + // one addition and one subtraction. + uint16_t* px = patched_coarse + 16 * total_patch_span * c; + initialize_coarse(px, v_coarsel, v_coarseh, kMargin); + + for (size_t patch_offset = kMargin; + patch_offset < total_patch_span - kMargin; patch_offset++) { + size_t median_index = (ksize * ksize) / 2, cdf = 0; + + px = patched_coarse + + 16 * (total_patch_span * c + + std::min(patch_offset + kMargin, total_patch_span - 1)); + increment_coarse(px, v_coarsel, v_coarseh); + + // Find median at coarse level + size_t coarse_index = find_coarse_index(cdf, median_index); + + uint16x8_t v_finel; + uint16x8_t v_fineh; + // Check whether the fine histogram (H.fine[coarse_index]) for the + // current patch position needs to be freshly initialized or can be + // incrementally updated. This decision hinges on the `luc` (Last Used + // Coordinate) table, which records the last horizontal patch offset + // processed for each coarse bin. + // + // The condition is true in two scenarios: + // + // 1. **First-Time Initialization**: + // - This is the first time we are accessing this coarse bin + // (`coarse_index`) at the current patch position. + // - We compute the full fine histogram from scratch by summing + // over `ksize` rows centered at the patch position. + // - We accumulate the results into the `v_finel` and `v_fineh` + // vector registers. + // - These vectors are then stored into `H.fine[coarse_index]`. + // - The `luc` table is updated to reflect the last index + // processed. + // + // 2. **Window Movement Causes Loss of Overlap**: + // - The sliding window has moved enough that it no longer + // sufficiently overlaps the region + // used to compute the previously cached fine histogram (i.e., + // `luc[coarse_index]` is too far behind). + // - We must reinitialize the fine histogram to ensure accuracy. + // + // Otherwise: + // + // - We reuse the previously computed fine histogram stored in + // `H.fine[coarse_index]`. + // - We only update it incrementally using the `update_fine()` + // function, which: + // - Adds the new values entering the window. + // - Subtracts the values leaving the window. + // - This avoids the need for a full re-scan, leveraging temporal + // locality between neighboring pixels. + // + // This lookup-based optimization significantly improves performance, + // as neighboring filter windows often overlap heavily—especially for + // small strides and moderate kernel sizes. + // + // After this step, the fine histogram is ready. The next phase scans + // `H.fine[coarse_index]` to identify the fine bin where the + // cumulative sum crosses the median threshold. This gives us the + // final median value for the output pixel. + if (luc[coarse_index] <= patch_offset - kMargin) { + v_finel = vdupq_n_u16(0); + v_fineh = vdupq_n_u16(0); + px = patched_fine + static_cast(16) * + (static_cast(total_patch_span) * + (16 * c + coarse_index) + + patch_offset - kMargin); + + initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, + kMargin, total_patch_span); + + } else { + v_finel = vld1q_u16(H.fine[coarse_index]); + v_fineh = vld1q_u16(H.fine[coarse_index] + 8); + px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index); + update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, + kMargin, total_patch_span); + } + + px = patched_coarse + + static_cast(16) * + (total_patch_span * c + + std::max(patch_offset - kMargin, static_cast(0))); + + vst1q_u16(H.fine[coarse_index], v_finel); + vst1q_u16(H.fine[coarse_index] + 8, v_fineh); + + decrement_coarse(px, v_coarsel, v_coarseh); + + // Find median at fine level + dst.at(static_cast(h), + static_cast(patch_offset - kMargin))[c] = + find_median(cdf, median_index, coarse_index); + } + } +}; + +template +void median_process(Rectangle image_dimensions, Rows src_rows, + Rows dst_rows, size_t y_begin, size_t y_end, + size_t kernel_width, size_t kernel_height, + FixedBorderType border_type) { + size_t kMargin = (kernel_width - 1) / 2; + MedianBlurLargeHist median_filter{src_rows.channels(), + kMargin}; + + // Process left border + size_t starting_width = 0; + size_t processing_left_width = kMargin; + Point starting_left_coordinates{starting_width, y_begin}; + Point ending_left_coordinates{starting_width + processing_left_width, y_end}; + median_filter.process_pixels_with_horizontal_borders( + image_dimensions, starting_left_coordinates, ending_left_coordinates, + src_rows, dst_rows, kernel_height, border_type); + + // Process center region + starting_width = processing_left_width; + size_t processing_center_width = image_dimensions.width() - 2 * kMargin; + Point starting_center_coordinates{starting_width, y_begin}; + Point ending_center_coordinates{starting_width + processing_center_width, + y_end}; + median_filter.process_pixels_without_horizontal_borders( + image_dimensions, starting_center_coordinates, ending_center_coordinates, + src_rows, dst_rows, kernel_height, border_type); + + // Process right border + starting_width = processing_left_width + processing_center_width; + size_t processing_right_width = image_dimensions.width() - + processing_left_width - + processing_center_width; + Point starting_right_coordinates{starting_width, y_begin}; + Point ending_right_coordinates{starting_width + processing_right_width, + y_end}; + median_filter.process_pixels_with_horizontal_borders( + image_dimensions, starting_right_coordinates, ending_right_coordinates, + src_rows, dst_rows, kernel_height, border_type); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8( + const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { + Rectangle image_dimensions{width, height}; + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + if (channels == 1) { + median_process(image_dimensions, src_rows, dst_rows, y_begin, y_end, + kernel_width, kernel_height, border_type); + } else { + median_process(image_dimensions, src_rows, dst_rows, y_begin, y_end, + kernel_width, kernel_height, border_type); + } + + return KLEIDICV_OK; +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/filters/median_blur_small_hist_neon.cpp b/kleidicv/src/filters/median_blur_small_hist_neon.cpp index 3c14f6eb0a5e225fd3022f96f051271c6ea4ab6e..32ab921a8561233e82d5afcd77b6d40d46e6e7b7 100644 --- a/kleidicv/src/filters/median_blur_small_hist_neon.cpp +++ b/kleidicv/src/filters/median_blur_small_hist_neon.cpp @@ -6,50 +6,10 @@ #include "kleidicv/filters/median_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "median_blur_border_handling.h" namespace kleidicv::neon { -static ptrdiff_t get_physical_index(size_t index, size_t limit, - FixedBorderType border_type) { - int result = 0; - int signed_index = static_cast(index); - int signed_limit = static_cast(limit); - - if (signed_index >= 0 && signed_index < signed_limit) { - return static_cast(index); - } - switch (border_type) { - case FixedBorderType::REPLICATE: { - result = std::clamp(signed_index, 0, signed_limit - 1); - break; - } - case FixedBorderType::REFLECT: { - if (signed_index < 0) { - result = -signed_index - 1; - } else { - result = 2 * signed_limit - signed_index - 1; - } - break; - } - - case FixedBorderType::WRAP: { - result = (signed_index + signed_limit) % signed_limit; - break; - } - - case FixedBorderType::REVERSE: { - if (signed_index < 0) { - result = std::min(-signed_index, signed_limit - 1); - } else { - result = 2 * signed_limit - signed_index - 2; - } - break; - } - } - - return static_cast(result); -} - // B. Weiss, "Fast Median and Bilateral Filtering," in *ACM SIGGRAPH 2006 // Papers*, ACM, New York, NY, USA, pp. 519–526, 2006. // The paper is currently available at: @@ -62,7 +22,7 @@ class MedianBlurSmallHist { Rectangle image_dimensions, Point starting_coordinates, Point ending_coordinates, Rows src_rows, Rows dst_rows, size_t ksize, FixedBorderType border_type) { - const size_t KMargin = ksize / 2; + const size_t kMargin = ksize / 2; for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w++) { for (ptrdiff_t ch = 0; ch < static_cast(src_rows.channels()); @@ -76,10 +36,10 @@ class MedianBlurSmallHist { for (size_t r = 0; r < ksize; r++) { for (size_t c = 0; c < ksize; c++) { const ptrdiff_t valid_h = - get_physical_index(starting_coordinates.y() + r - KMargin, + get_physical_index(starting_coordinates.y() + r - kMargin, image_dimensions.height(), border_type); const ptrdiff_t valid_w = get_physical_index( - w + c - KMargin, image_dimensions.width(), border_type); + w + c - kMargin, image_dimensions.width(), border_type); uint8_t pixel = src_rows.at(valid_h, valid_w)[ch]; @@ -95,14 +55,14 @@ class MedianBlurSmallHist { for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); h++) { const ptrdiff_t valid_new_h = get_physical_index( - h + KMargin, image_dimensions.height(), border_type); + h + kMargin, image_dimensions.height(), border_type); const ptrdiff_t valid_old_h = get_physical_index( - h - KMargin - 1, image_dimensions.height(), border_type); + h - kMargin - 1, image_dimensions.height(), border_type); for (size_t c = 0; c < ksize; c++) { const ptrdiff_t valid_w = get_physical_index( - w + c - KMargin, image_dimensions.width(), border_type); + w + c - kMargin, image_dimensions.width(), border_type); uint8_t incoming_pixel = src_rows.at(valid_new_h, valid_w)[ch]; @@ -382,7 +342,7 @@ class MedianBlurSmallHist { } }; -kleidicv_error_t median_blur_small_hist_stripe_u8( +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_small_hist_stripe_u8( const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { @@ -390,11 +350,11 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; MedianBlurSmallHist median_filter; - const size_t KMargin = kernel_width / 2; + const size_t kMargin = kernel_width / 2; // Process left border size_t starting_width = 0; - const size_t processing_left_width = KMargin; + const size_t processing_left_width = kMargin; Point starting_left_coordinates{starting_width, y_begin}; Point ending_left_coordinates{starting_width + processing_left_width, y_end}; @@ -405,12 +365,12 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( // Process center region starting_width = processing_left_width; // Compute the width of the center region that can be processed with NEON - // instructions. Subtract 2 * KMargin to exclude left and right borders, which + // instructions. Subtract 2 * kMargin to exclude left and right borders, which // are handled separately using scalar code due to varying border modes (e.g., // REPLICATE, REFLECT, WRAP, REVERSE). Align the remaining width down to the // nearest multiple of 16 to match NEON's 128-bit register width (16 bytes for // uint8x16_t). - const size_t processing_center_width = ((width - 2 * KMargin) / 16) * 16; + const size_t processing_center_width = ((width - 2 * kMargin) / 16) * 16; Point starting_center_coordinates{starting_width * channels, y_begin}; Point ending_center_coordinates{ (processing_center_width + starting_width) * channels, y_end}; diff --git a/kleidicv/src/filters/median_blur_sorting_network_neon.cpp b/kleidicv/src/filters/median_blur_sorting_network_neon.cpp index e46e37c9a4da93778e00ef2bb6c01d577b5dad42..645cb2ea646e701ecba8599e28e36d5b665d0f1e 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_neon.cpp +++ b/kleidicv/src/filters/median_blur_sorting_network_neon.cpp @@ -187,6 +187,7 @@ kleidicv_error_t median_blur_sorting_network_stripe( border_type, filter); return KLEIDICV_OK; } + if (kernel_width == 5) { MedianBlurSortingNetwork median_filter; Filter2D5x5> filter{median_filter}; @@ -194,15 +195,12 @@ kleidicv_error_t median_blur_sorting_network_stripe( filter); return KLEIDICV_OK; } - if (kernel_width == 7) { - MedianBlurSortingNetwork median_filter; - Filter2D7x7> filter{median_filter}; - process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, - filter); - return KLEIDICV_OK; - } - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + MedianBlurSortingNetwork median_filter; + Filter2D7x7> filter{median_filter}; + process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, + filter); + return KLEIDICV_OK; } #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ diff --git a/kleidicv/src/filters/median_blur_sorting_network_sc.h b/kleidicv/src/filters/median_blur_sorting_network_sc.h index 895b363c3160381c914180049d3a6675d2da0182..f8496b3e8df1bccc92be3a9965036120844bcae6 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_sc.h +++ b/kleidicv/src/filters/median_blur_sorting_network_sc.h @@ -144,6 +144,7 @@ kleidicv_error_t median_blur_sorting_network_stripe_sc( border_type, filter); return KLEIDICV_OK; } + if (kernel_width == 5) { MedianBlurSortingNetwork median_filter; Filter2D5x5> filter{median_filter}; @@ -151,15 +152,12 @@ kleidicv_error_t median_blur_sorting_network_stripe_sc( filter); return KLEIDICV_OK; } - if (kernel_width == 7) { - MedianBlurSortingNetwork median_filter; - Filter2D7x7> filter{median_filter}; - process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, - filter); - return KLEIDICV_OK; - } - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + MedianBlurSortingNetwork median_filter; + Filter2D7x7> filter{median_filter}; + process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, + filter); + return KLEIDICV_OK; } } // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 3d5ddb041a100bc03d07bcd67a0f8b0343e5ce8e..45a49ef0d10d6d3f720f9363f7c51c4e10bd5992 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -593,7 +593,16 @@ kleidicv_error_t kleidicv_thread_median_blur_u8( return checks_result; } - if (kernel_width > 7) { + if (kernel_width <= 7) { + auto callback = [=](unsigned y_begin, unsigned y_end) { + return kleidicv_median_blur_sorting_network_stripe_u8( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, + channels, kernel_width, kernel_height, fixed_border_type); + }; + return parallel_batches(callback, mt, height); + } + + if (kernel_width > 7 && kernel_width <= 15) { auto callback = [=](unsigned y_begin, unsigned y_end) { return kleidicv_median_blur_small_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, @@ -603,7 +612,7 @@ kleidicv_error_t kleidicv_thread_median_blur_u8( } auto callback = [=](unsigned y_begin, unsigned y_end) { - return kleidicv_median_blur_sorting_network_stripe_u8( + return kleidicv_median_blur_large_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, kernel_width, kernel_height, fixed_border_type); }; diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 3b22f42f24b17b648770d8869cfcd4a64b782414..4d8f8dc5523454a901965efe9c18cf7f9605e6a9 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -38,6 +38,9 @@ MedianBlur9x9: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, MedianBlur11x11: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 11)' MedianBlur13x13: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 13)' MedianBlur15x15: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 15)' +MedianBlur17x17: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 17)' +MedianBlur27x27: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 27)' +MedianBlur35x35: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 35)' GaussianBlur3x3: opencv_perf_imgproc '*gaussianBlur3x3/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur5x5: opencv_perf_imgproc '*gaussianBlur5x5/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' diff --git a/test/api/test_median_blur.cpp b/test/api/test_median_blur.cpp index c54ba43e719cfa8931fd43546e059f360c10a43e..7f3326b2c3c6f04482c1686a299bca722c3ab48a 100644 --- a/test/api/test_median_blur.cpp +++ b/test/api/test_median_blur.cpp @@ -65,13 +65,13 @@ class MedianBlurTest : public testing::Test { return cases; } - static std::vector get_unpadded_test_cases() { - std::vector widths = {50}; + static std::vector get_unpadded_test_cases(size_t filter_size) { + std::vector widths = {2 * filter_size + 16}; std::vector src_paddings = {0}; std::vector dst_paddings = {0}; - std::vector heights = {30}; - std::vector channels = {1, 4}; - std::vector filter_sizes = {3, 5, 7}; + std::vector heights = {2 * filter_size + 16}; + std::vector channels = {1, 2, 3, 4}; + std::vector filter_sizes = {filter_size}; std::vector border_types = { KLEIDICV_BORDER_TYPE_REPLICATE, KLEIDICV_BORDER_TYPE_REFLECT, KLEIDICV_BORDER_TYPE_WRAP, KLEIDICV_BORDER_TYPE_REVERSE}; @@ -80,13 +80,13 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } - static std::vector get_padded_test_cases() { - std::vector widths = {20}; + static std::vector get_padded_test_cases(size_t filter_size) { + std::vector widths = {2 * filter_size + 16}; std::vector src_paddings = {5}; std::vector dst_paddings = {13}; - std::vector heights = {10}; - std::vector channels = {1, 4}; - std::vector filter_sizes = {3, 5, 7}; + std::vector heights = {2 * filter_size + 16}; + std::vector channels = {1, 2, 3, 4}; + std::vector filter_sizes = {filter_size}; std::vector border_types = { KLEIDICV_BORDER_TYPE_REPLICATE}; @@ -94,7 +94,7 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } - static std::vector get_small_image_test_cases( + static std::vector get_small_range_filter_test_cases( size_t filter_size) { std::vector widths = {25, filter_size - 1}; std::vector src_paddings = {0}; @@ -124,6 +124,21 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } + static std::vector get_large_range_filter_test_cases( + size_t filter_size) { + std::vector widths = {60}; + std::vector src_paddings = {0}; + std::vector dst_paddings = {5}; + std::vector heights = {60, filter_size - 1}; + std::vector channels = {1, 3}; + std::vector filter_sizes = {filter_size}; + std::vector border_types = { + KLEIDICV_BORDER_TYPE_REPLICATE, KLEIDICV_BORDER_TYPE_REFLECT, + KLEIDICV_BORDER_TYPE_WRAP, KLEIDICV_BORDER_TYPE_REVERSE}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, filter_sizes, border_types); + } + void run_test_case(const TestParams& params) { test::Array2D src{params.width * params.channels, params.height, params.src_padding, @@ -233,21 +248,42 @@ using ElementTypes = ::testing::Types; TYPED_TEST_SUITE(MedianBlurTest, ElementTypes); -TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithoutPadding) { +TYPED_TEST(MedianBlurTest, + RunAllParamCombinationsWithoutPaddingWithSmallFilterSize) { + if (test::Options::are_long_running_tests_skipped()) { + GTEST_SKIP() + << "Long running test " + "MedianBlurTest::" + "RunAllParamCombinationsWithoutPaddingWithSmallFilterSize skipped"; + } + + for (auto ksize : {3, 5, 7}) { + for (const auto& params : TestFixture::get_unpadded_test_cases(ksize)) { + this->run_test_case(params); + } + } +} + +TYPED_TEST(MedianBlurTest, + RunAllParamCombinationsWithPaddingWithSmallFilterSize) { if (test::Options::are_long_running_tests_skipped()) { GTEST_SKIP() << "Long running test " - "MedianBlurTest::RunAllParamCombinationsWithoutPadding " + "MedianBlurTest::" + "RunAllParamCombinationsWithPaddingWithSmallFilterSize " "skipped"; } - for (const auto& params : TestFixture::get_unpadded_test_cases()) { - this->run_test_case(params); + for (auto ksize : {3, 5, 7}) { + for (const auto& params : TestFixture::get_padded_test_cases(ksize)) { + this->run_test_case(params); + } } } -TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithSmallImageSize) { +TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithSmallImageAndFilterSize) { for (auto ksize : {3, 5, 7}) { - for (const auto& params : TestFixture::get_small_image_test_cases(ksize)) { + for (const auto& params : + TestFixture::get_small_range_filter_test_cases(ksize)) { this->run_test_case(params); } } @@ -324,62 +360,53 @@ TYPED_TEST(MedianBlurTest, OversizeImage) { } TYPED_TEST(MedianBlurTest, UnsupportedFilterSizes) { - test::Array2D src{100, 100}; - test::Array2D dst{100, 100}; + test::Array2D src{1000, 1000}; + test::Array2D dst{1000, 1000}; // Test unsupported large square filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 100, 100, + dst.stride(), 1000, 1000, 1, 257, 257, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test non-square filter with valid height EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 100, 5, + dst.stride(), 1000, 1000, 1, 100, 5, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test non-square filter with valid width EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 5, 100, + dst.stride(), 1000, 1000, 1, 5, 100, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test unsupported small filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 1, 1, + dst.stride(), 1000, 1000, 1, 1, 1, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test unsupported even filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 4, 4, + dst.stride(), 1000, 1000, 1, 4, 4, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test mid-range square filters that are not implemented EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 9, 9, + dst.stride(), 1000, 1000, 1, 9, 9, KLEIDICV_BORDER_TYPE_TRANSPARENT)); if (!std::is_same_v) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 9, 9, + dst.stride(), 1000, 1000, 1, 9, 9, KLEIDICV_BORDER_TYPE_REPLICATE)); } } -TYPED_TEST(MedianBlurTest, NonSquareFilterSizeWithValidWidth) { - test::Array2D src{100, 100}; - test::Array2D dst{100, 100}; - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 5, 100, - KLEIDICV_BORDER_TYPE_REPLICATE)); -} - TYPED_TEST(MedianBlurTest, SrcDstChannelCombinations) { using ElementType = TypeParam; constexpr size_t width = 10; @@ -426,32 +453,57 @@ TYPED_TEST(MedianBlurTest, SrcDstChannelCombinations) { EXPECT_EQ(KLEIDICV_OK, status); } } - template -class MedianBlurByteStrideTest : public MedianBlurTest {}; +class MedianBlurMidAndLargeRangeTest : public MedianBlurTest {}; +using ByteType = ::testing::Types; +TYPED_TEST_SUITE(MedianBlurMidAndLargeRangeTest, ByteType); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithSmallImageAndMidRangeFilterSize) { + for (const auto& params : TestFixture::get_mid_range_filter_test_cases()) { + this->run_test_case(params); + } +} -using ByteStrideTypes = ::testing::Types; -TYPED_TEST_SUITE(MedianBlurByteStrideTest, ByteStrideTypes); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithSmallImageAndLargeRangeFilterSize) { + for (auto ksize : {17, 35}) { + for (const auto& params : + TestFixture::get_large_range_filter_test_cases(ksize)) { + this->run_test_case(params); + } + } +} -TYPED_TEST(MedianBlurByteStrideTest, RunAllParamCombinationsWithPadding) { +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithoutPaddingWithMidAndLargeFilterSize) { if (test::Options::are_long_running_tests_skipped()) { GTEST_SKIP() << "Long running test " - "MedianBlurByteStrideTest::RunAllParamCombinationsWithPadding " + "MedianBlurMidAndLargeRangeTest::" + "RunAllParamCombinationsWithoutPaddingWithMidAndLargeFilterSize " "skipped"; } - for (const auto& params : TestFixture::get_padded_test_cases()) { - this->run_test_case(params); + for (auto ksize : {9, 15, 17, 255}) { + for (const auto& params : TestFixture::get_unpadded_test_cases(ksize)) { + this->run_test_case(params); + } } } -template -class MedianBlurMidRangeTest : public MedianBlurTest {}; -using ByteType = ::testing::Types; -TYPED_TEST_SUITE(MedianBlurMidRangeTest, ByteType); -TYPED_TEST(MedianBlurMidRangeTest, RunAllParamCombinationsWithMidRangeFilters) { - for (const auto& params : TestFixture::get_mid_range_filter_test_cases()) { - this->run_test_case(params); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithPaddingWithMidAndLargeFilterSize) { + if (test::Options::are_long_running_tests_skipped()) { + GTEST_SKIP() + << "Long running test " + "MedianBlurMidAndLargeRangeTest::" + "RunAllParamCombinationsWithPaddingWithMidAndLargeFilterSize " + "skipped"; + } + + for (auto ksize : {9, 15, 17, 255}) { + for (const auto& params : TestFixture::get_padded_test_cases(ksize)) { + this->run_test_case(params); + } } } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index f93d8b8ccfdf9caf692808c2a1f36fd3514ea8b8..03649e374a124c2a992f5bc437d17449aaf62a36 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -113,7 +113,7 @@ class Thread : public testing::TestWithParam

{ (void)thread_count; size_t channels = 1; kleidicv_border_type_t border_type = KLEIDICV_BORDER_TYPE_REPLICATE; - const auto &filter_size = std::vector{3, 5, 7, 9}; + const auto &filter_size = std::vector{3, 5, 7, 9, 17}; for (auto ksize : filter_size) { check_unary_op(single_threaded_func, multithreaded_func, channels, channels, channels, ksize, ksize, border_type); @@ -957,7 +957,7 @@ INSTANTIATE_TEST_SUITE_P( P{1, 7, 4}, P{12, 34, 5}, P{1, 16, 1}, P{1, 32, 1}, P{1, 32, 2}, P{2, 16, 2}, P{2, 32, 1}, P{1, 48, 2}, P{2, 48, 1}, P{6, 64, 1}, P{4, 80, 2}, P{2, 96, 3}, - P{1, 112, 4}, P{12, 34, 5})); + P{1, 112, 4}, P{12, 34, 5}, P{40, 34, 5})); TEST(ThreadScaleU8, NotImplemented) { test::Array2D src(size_t{1}, 1), dst(size_t{1}, 1);