From afc6b2201407ffe93275f7c1bb07a9d9734bf000 Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Wed, 2 Jul 2025 17:15:08 +0100 Subject: [PATCH 1/9] Add Median Blur generic NEON version --- CHANGELOG.md | 2 +- benchmark/benchmark.cpp | 10 +- conformity/opencv/test_median_blur.cpp | 34 +- doc/functionality.md | 18 +- doc/opencv.md | 6 +- .../include/kleidicv/filters/median_blur.h | 17 +- kleidicv/include/kleidicv/kleidicv.h | 8 +- kleidicv/src/filters/median_blur_api.cpp | 14 +- .../src/filters/median_blur_border_handling.h | 59 ++ .../filters/median_blur_large_hist_neon.cpp | 670 ++++++++++++++++++ .../filters/median_blur_small_hist_neon.cpp | 62 +- kleidicv_thread/src/kleidicv_thread.cpp | 13 +- scripts/benchmark/benchmarks.txt | 3 + test/api/test_median_blur.cpp | 125 +++- test/api/test_thread.cpp | 2 +- 15 files changed, 930 insertions(+), 113 deletions(-) create mode 100644 kleidicv/src/filters/median_blur_border_handling.h create mode 100644 kleidicv/src/filters/median_blur_large_hist_neon.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 935e95a48..71c3cee7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 3x3 kernels. -- Median Blur for 9x9, 11x11, 13x13 and 15x15 kernels, Neon backend only. +- Median Blur for generic kernels, Neon backend only. ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 4d35a0111..75d9cf544 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -616,7 +616,10 @@ BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 1, , kleidicv_median_blur_u8) ->Arg(9) ->Arg(11) ->Arg(13) - ->Arg(15); + ->Arg(15) + ->Arg(17) + ->Arg(27) + ->Arg(35); BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 4, , kleidicv_median_blur_u8) ->Arg(3) ->Arg(5) @@ -624,7 +627,10 @@ BENCHMARK_TEMPLATE2_CAPTURE(median_blur, uint8_t, 4, , kleidicv_median_blur_u8) ->Arg(9) ->Arg(11) ->Arg(13) - ->Arg(15); + ->Arg(15) + ->Arg(17) + ->Arg(27) + ->Arg(35); BENCHMARK_TEMPLATE2_CAPTURE(median_blur, int8_t, 1, , kleidicv_median_blur_s8) ->Arg(3) diff --git a/conformity/opencv/test_median_blur.cpp b/conformity/opencv/test_median_blur.cpp index 85b78bc29..f9b9bffb1 100644 --- a/conformity/opencv/test_median_blur.cpp +++ b/conformity/opencv/test_median_blur.cpp @@ -19,9 +19,23 @@ bool test_median_blur(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { cv::RNG rng(0); - for (size_t x = 5; x <= 16; ++x) { - for (size_t y = 5; y <= 16; ++y) { - cv::Mat input(x, y, get_opencv_matrix_type()); + size_t size_min{0}; + size_t size_max{0}; + size_t step{0}; + + if constexpr (KernelSize <= 15) { + size_min = KernelSize - 1; + size_max = 2 * KernelSize + 16; + step = 1; + } else { + size_min = KernelSize - 1; + size_max = KernelSize + 16; + step = 16; + } + + for (size_t w = size_min; w <= size_max; w += step) { + for (size_t h = size_min; h <= size_max; h += step) { + cv::Mat input(w, h, get_opencv_matrix_type()); rng.fill(input, cv::RNG::UNIFORM, std::numeric_limits::min(), std::numeric_limits::max()); @@ -30,7 +44,7 @@ bool test_median_blur(int index, RecreatedMessageQueue& request_queue, reply_queue, input); if (are_matrices_different(0, actual, expected)) { - fail_print_matrices(x, y, input, actual, expected); + fail_print_matrices(w, h, input, actual, expected); return true; } } @@ -82,6 +96,18 @@ std::vector& median_blur_tests_get() { TEST("Median 15x15, 1 channel (U8)", (test_median_blur<15, uint8_t, 1>), exec_median_blur<15>), TEST("Median 15x15, 3 channel (U8)", (test_median_blur<15, uint8_t, 3>), exec_median_blur<15>), TEST("Median 15x15, 4 channel (U8)", (test_median_blur<15, uint8_t, 4>), exec_median_blur<15>), + TEST("Median 17x17, 1 channel (U8)", (test_median_blur<17, uint8_t, 1>), exec_median_blur<17>), + TEST("Median 17x17, 3 channel (U8)", (test_median_blur<17, uint8_t, 3>), exec_median_blur<17>), + TEST("Median 17x17, 4 channel (U8)", (test_median_blur<17, uint8_t, 4>), exec_median_blur<17>), + TEST("Median 27x27, 1 channel (U8)", (test_median_blur<27, uint8_t, 1>), exec_median_blur<27>), + TEST("Median 27x27, 3 channel (U8)", (test_median_blur<27, uint8_t, 3>), exec_median_blur<27>), + TEST("Median 27x27, 4 channel (U8)", (test_median_blur<27, uint8_t, 4>), exec_median_blur<27>), + TEST("Median 35x35, 1 channel (U8)", (test_median_blur<35, uint8_t, 1>), exec_median_blur<35>), + TEST("Median 35x35, 3 channel (U8)", (test_median_blur<35, uint8_t, 3>), exec_median_blur<35>), + TEST("Median 35x35, 4 channel (U8)", (test_median_blur<35, uint8_t, 4>), exec_median_blur<35>), + TEST("Median 255x255, 1 channel (U8)", (test_median_blur<255, uint8_t, 1>), exec_median_blur<255>), + TEST("Median 255x255, 3 channel (U8)", (test_median_blur<255, uint8_t, 3>), exec_median_blur<255>), + TEST("Median 255x255, 4 channel (U8)", (test_median_blur<255, uint8_t, 4>), exec_median_blur<255>), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 255fb9beb..7ecdd4662 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -72,15 +72,15 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Rotate (90 degrees clockwise) | x | x | x | x | ## Image filters -| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | -|---------------------------------------------|-----|-----|-----|-----|-----|-----|-----| -| Erode | | x | | | | | | -| Dilate | | x | | | | | | -| Sobel (3x3) | | x | | | | | | -| Separable Filter 2D (5x5) | | x | x | x | | | | -| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | -| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | -| Median Blur (9x9, 11x11, 13x13, 15x15) | | x | | | | | | +| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | +|----------------------------------------------|-----|-----|-----|-----|-----|-----|-----| +| Erode | | x | | | | | | +| Dilate | | x | | | | | | +| Sobel (3x3) | | x | | | | | | +| Separable Filter 2D (5x5) | | x | x | x | | | | +| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | +| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | +| Median Blur (generic imp, max size 255x255) | | x | | | | | | ## Resize to quarter diff --git a/doc/opencv.md b/doc/opencv.md index 5175804c6..9672b1e58 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -166,10 +166,8 @@ Applies median filter to a given image. Notes on parameters: * `src.cols`,`src.rows` - image width and height must be greater than or equal to `ksize - 1` (e.g., `>= 4` for 5x5, `>= 14` for 15x15). -* `ksize` - - for `CV_8U`, supported kernel sizes are 3x3, 5×5, 7×7, 9×9, 11×11, 13×13, and 15×15. - - For other types, only 3x3, 5×5 and 7×7 are supported. - +* `ksize` - for `CV_8U`, supported kernel sizes are 3×3 to 255x255.\ + For other types, only 3x3, 5×5 and 7×7 are supported. ### [`cv::transpose()`](https://docs.opencv.org/4.10.0/d2/de8/group__core__array.html#ga46630ed6c0ea6254a35f447289bd7404) Transposes a matrix. diff --git a/kleidicv/include/kleidicv/filters/median_blur.h b/kleidicv/include/kleidicv/filters/median_blur.h index 16a30f7ac..31efc8da9 100644 --- a/kleidicv/include/kleidicv/filters/median_blur.h +++ b/kleidicv/include/kleidicv/filters/median_blur.h @@ -93,6 +93,16 @@ KLEIDICV_API_DECLARATION(kleidicv_median_blur_small_hist_stripe_u8, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, kleidicv::FixedBorderType border_type); + +// For internal use only. See instead kleidicv_median_blur_u8. +// Find a median across an image. +// The stripe is defined by the range (y_begin, y_end]. +KLEIDICV_API_DECLARATION(kleidicv_median_blur_large_hist_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, + kleidicv::FixedBorderType border_type); } namespace kleidicv { @@ -108,6 +118,11 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, FixedBorderType border_type); + +kleidicv_error_t median_blur_large_hist_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type); } // namespace neon namespace sve2 { @@ -145,7 +160,7 @@ inline kleidicv_error_t check_ptrs_strides_imagesizes(const T *src, inline bool is_kernel_size_supported(size_t kernel_width, size_t kernel_height) { return (kernel_width == kernel_height) && (kernel_width >= 3) && - (kernel_width <= 15) && ((kernel_width % 2) != 0); + (kernel_width <= 255) && ((kernel_width % 2) != 0); } template diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 0b56a5a20..257470a2f 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -2058,12 +2058,12 @@ kleidicv_error_t kleidicv_warp_perspective_u8( /// @ref KLEIDICV_MAXIMUM_CHANNEL_COUNT. /// @param kernel_width Width of the Median kernel. Must be odd and equal to /// `kernel_height`. -/// For `uint8_t`, values 3 to 15 are supported. For other -/// types, only 3, 5 and 7. +/// For `uint8_t`, values 3 to 255 are supported. For +/// other types, only 3, 5 and 7. /// @param kernel_height Height of the Median kernel. Must be odd and equal to /// `kernel_width`. -/// For `uint8_t`, values 3 to 15 are supported. For other -/// types, only 3, 5 and 7. +/// For `uint8_t`, values 3 to 255 are supported. For +/// other types, only 3, 5 and 7. /// @param border_type Way of handling the border. The supported border types /// are: \n /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE \n diff --git a/kleidicv/src/filters/median_blur_api.cpp b/kleidicv/src/filters/median_blur_api.cpp index 5c4b79c22..2a5e20ac1 100644 --- a/kleidicv/src/filters/median_blur_api.cpp +++ b/kleidicv/src/filters/median_blur_api.cpp @@ -33,6 +33,10 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_median_blur_small_hist_stripe_u8, &kleidicv::neon::median_blur_small_hist_stripe_u8, nullptr, nullptr); +KLEIDICV_MULTIVERSION_C_API(kleidicv_median_blur_large_hist_stripe_u8, + &kleidicv::neon::median_blur_large_hist_stripe_u8, + nullptr, nullptr); + extern "C" { kleidicv_error_t kleidicv_median_blur_s8(const int8_t *src, size_t src_stride, @@ -70,13 +74,19 @@ kleidicv_error_t kleidicv_median_blur_u8(const uint8_t *src, size_t src_stride, return checks_result; } - if (kernel_width > 7) { + if (kernel_width <= 7) { + return kleidicv_median_blur_sorting_network_stripe_u8( + src, src_stride, dst, dst_stride, width, height, 0, height, channels, + kernel_width, kernel_height, fixed_border_type); + } + + if (kernel_width > 7 && kernel_width <= 15) { return kleidicv_median_blur_small_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, 0, height, channels, kernel_width, kernel_height, fixed_border_type); } - return kleidicv_median_blur_sorting_network_stripe_u8( + return kleidicv_median_blur_large_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, 0, height, channels, kernel_width, kernel_height, fixed_border_type); } diff --git a/kleidicv/src/filters/median_blur_border_handling.h b/kleidicv/src/filters/median_blur_border_handling.h new file mode 100644 index 000000000..5a7913d84 --- /dev/null +++ b/kleidicv/src/filters/median_blur_border_handling.h @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H +#define KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H + +#include + +#include "kleidicv/kleidicv.h" + +namespace kleidicv::neon { + +static ptrdiff_t get_physical_index(size_t index, size_t limit, + FixedBorderType border_type) { + int result = 0; + int signed_index = static_cast(index); + int signed_limit = static_cast(limit); + + if (signed_index >= 0 && signed_index < signed_limit) { + return static_cast(index); + } + switch (border_type) { + case FixedBorderType::REFLECT: { + if (signed_index < 0) { + result = -signed_index - 1; + } else { + result = 2 * signed_limit - signed_index - 1; + } + break; + } + + case FixedBorderType::WRAP: { + result = signed_index < 0 + ? signed_limit + signed_index + : (signed_index >= signed_limit ? signed_index - signed_limit + : signed_index); + break; + } + + case FixedBorderType::REVERSE: { + if (signed_index < 0) { + result = std::min(-signed_index, signed_limit - 1); + } else { + result = 2 * signed_limit - signed_index - 2; + } + break; + } + default: /* FixedBorderType::REPLICATE */ { + result = std::clamp(signed_index, 0, signed_limit - 1); + break; + } + } + + return static_cast(result); +} + +} // namespace kleidicv::neon +#endif // KLEIDICV_MEDIAN_BLUR_BORDER_HANDLING_H diff --git a/kleidicv/src/filters/median_blur_large_hist_neon.cpp b/kleidicv/src/filters/median_blur_large_hist_neon.cpp new file mode 100644 index 000000000..cf48df90f --- /dev/null +++ b/kleidicv/src/filters/median_blur_large_hist_neon.cpp @@ -0,0 +1,670 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/median_blur.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "median_blur_border_handling.h" + +namespace kleidicv::neon { + +class MedianBlurLargeHist { + public: + MedianBlurLargeHist() : patched_coarse{}, patched_fine{}, H{}, luc{} {} + + template + void process_pixels_without_horizontal_borders( + Rectangle image_dimensions, Point starting_coordinates, + Point ending_coordinates, Rows src_rows, + Rows dst_rows, size_t ksize, FixedBorderType border_type) { + const size_t kMargin = ksize / 2; + constexpr size_t patch_length = 512; + + // Allocate histogram buffer for a patch plus extra margin pixels on both + // sides. + // To compute a complete ksize x ksize median filter for each pixel in the + // patch, we need kMargin extra pixels on the left and right (i.e., + // 2*kMargin total) so each patch row has enough context (e.g., for 5x5 + // filter, kMargin=2 → 1+2+2=5 pixels per row). + patched_coarse = + static_cast(malloc(16 * (patch_length + 2 * kMargin) * + src_rows.channels() * sizeof(uint16_t))); + patched_fine = + static_cast(malloc(256 * (patch_length + 2 * kMargin) * + src_rows.channels() * sizeof(uint16_t))); + + for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); + w += patch_length) { + const size_t total_patch_span = + std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; + + Rows shifted_dst{ + &dst_rows[0] + static_cast(w) * dst_rows.channels(), + dst_rows.stride(), dst_rows.channels()}; + + clear_patched_histogram(total_patch_span * src_rows.channels()); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + // We initialize with ksize rows to allow merging of + // histogram increment and decrement operations in the main loop. + // This extra initial load enables a single update phase and avoids + // splitting the logic into separate steps. + for (size_t r = 0; r < ksize; ++r) { + const ptrdiff_t valid_h = + get_physical_index(starting_coordinates.y() + r - kMargin, + image_dimensions.height(), border_type); + initialize_patched_histogram( + src_rows, c, valid_h, total_patch_span, w, kMargin); + } + compute_patch_median_from_histogram(starting_coordinates.y(), c, + total_patch_span, kMargin, ksize, + shifted_dst); + } + + for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); + ++h) { + const ptrdiff_t valid_old_h = get_physical_index( + h - kMargin - 1, image_dimensions.height(), border_type); + + const ptrdiff_t valid_new_h = get_physical_index( + h + kMargin, image_dimensions.height(), border_type); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + update_patch_histogram( + src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, + kMargin); + + compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, + ksize, shifted_dst); + } + } + } + + free(patched_coarse); + free(patched_fine); + } + + void process_pixels_with_horizontal_borders( + Rectangle image_dimensions, Point starting_coordinates, + Point ending_coordinates, Rows src_rows, + Rows dst_rows, size_t ksize, FixedBorderType border_type) { + const size_t kMargin = ksize / 2; + constexpr size_t patch_length = 512; + + patched_coarse = + static_cast(malloc(16 * (patch_length + 2 * kMargin) * + src_rows.channels() * sizeof(uint16_t))); + patched_fine = + static_cast(malloc(256 * (patch_length + 2 * kMargin) * + src_rows.channels() * sizeof(uint16_t))); + + for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); + w += patch_length) { + const size_t total_patch_span = + std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; + Rows shifted_dst{ + &dst_rows[0] + static_cast(w) * dst_rows.channels(), + dst_rows.stride(), dst_rows.channels()}; + + clear_patched_histogram(total_patch_span * src_rows.channels()); + + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + // We initialize with ksize rows to allow merging of + // histogram increment and decrement operations in the main loop. + // This extra initial load enables a single update phase and avoids + // splitting the logic into separate steps. + for (size_t r = 0; r < ksize; ++r) { + const ptrdiff_t valid_h = + get_physical_index(starting_coordinates.y() + r - kMargin, + image_dimensions.height(), border_type); + initialize_patched_histogram(src_rows, c, valid_h, total_patch_span, + w, kMargin, image_dimensions.width(), + border_type); + } + compute_patch_median_from_histogram(starting_coordinates.y(), c, + total_patch_span, kMargin, ksize, + shifted_dst); + } + + for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); + ++h) { + const ptrdiff_t valid_old_h = get_physical_index( + h - kMargin - 1, image_dimensions.height(), border_type); + const ptrdiff_t valid_new_h = get_physical_index( + h + kMargin, image_dimensions.height(), border_type); + for (ptrdiff_t c = 0; c < static_cast(src_rows.channels()); + ++c) { + clear_lookup_table(); + + update_patch_histogram(src_rows, c, valid_old_h, valid_new_h, + total_patch_span, w, kMargin, + image_dimensions.width(), border_type); + + compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, + ksize, shifted_dst); + } + } + } + + free(patched_coarse); + free(patched_fine); + } + + private: + // Histogram Buffer Layout Explanation + // ----------------------------------- + // patched_coarse: + // - Conceptually a 3D array: + // coarse[channel_idx][patch_offset][coarse_bin] + // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] + // - Flattened as: + // coarse_offset = 16 * (patch_length * channel_idx + patch_offset) + // + (incoming_pixel >> 4); + // + // patched_fine: + // - Conceptually a 4D array: + // fine[channel_idx][coarse_bin][patch_offset][fine_bin] + // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] + // - fine_bin = incoming_pixel & 0xF ∈ [0, 15] + // - Flattened as: + // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin) + // + patch_offset) + fine_bin; + // + // This layout enables fast linear access while preserving the hierarchical + // structure of histograms per channel and patch position. + + uint16_t* patched_coarse; + uint16_t* patched_fine; + + // Clear only the portion of the patched histogram buffers that will be used + // for the current patch. + // Since these buffers are large, there's no need to zero out the entire + // allocation— only the section relevant to the current total patch size is + // cleared for efficiency. + void clear_patched_histogram(size_t total_patch_size) { + std::memset(patched_coarse, 0, + 16 * total_patch_size * sizeof(patched_coarse[0])); + std::memset(patched_fine, 0, + 256 * total_patch_size * sizeof(patched_fine[0])); + } + + // will be used for scalar load + void initialize_patched_histogram(int incoming_pixel, size_t channel_idx, + size_t patch_length, size_t patch_offset) { + const size_t coarse_offset = + 16 * (patch_length * channel_idx + patch_offset) + + (incoming_pixel >> 4); + const size_t fine_offset = + 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) + + patch_offset) + + (incoming_pixel & 0xF); + patched_coarse[coarse_offset]++; + patched_fine[fine_offset]++; + } + + // will be used if we have vector load for single channel + void initialize_patched_histogram(uint8x16_t& incoming_pixels, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { + KLEIDICV_FORCE_LOOP_UNROLL + for (int i = 0; i < 16; i++) { + const size_t coarse_offset_incoming = + 16 * (patch_length * channel_idx + patch_offset + i) + + (incoming_pixels[i] >> 4); + + const size_t fine_offset_incoming = + 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + + patch_offset + i) + + (incoming_pixels[i] & 0xF); + + patched_coarse[coarse_offset_incoming]++; + patched_fine[fine_offset_incoming]++; + } + } + + // This function is used without horizontal borders handling + template + void initialize_patched_histogram(Rows src_rows, ptrdiff_t c, + ptrdiff_t valid_h, size_t total_patch_span, + size_t starting_width, size_t kMargin) { + size_t vector_part = 0; + if constexpr (is_single_channel) { + vector_part = (total_patch_span >> 4) << 4; + for (size_t patch_offset = 0; patch_offset < vector_part; + patch_offset += 16) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]); + initialize_patched_histogram(incoming_pixels, c, total_patch_span, + patch_offset); + } + } + + for (size_t patch_offset = vector_part; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; + + initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); + } + } + + // This function is used with horizontal borders handling + void initialize_patched_histogram(Rows src_rows, ptrdiff_t c, + ptrdiff_t valid_h, size_t total_patch_span, + size_t starting_width, size_t kMargin, + size_t width, FixedBorderType border_type) { + for (size_t patch_offset = 0; patch_offset < total_patch_span; + ++patch_offset) { + ptrdiff_t valid_w = get_physical_index( + starting_width + patch_offset - kMargin, width, border_type); + + auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; + + initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); + } + } + + // During vertical traversal (the main 'height' loop), each sliding window + // iteration introduces a new incoming row and removes an outgoing one. The + // histogram must be updated accordingly by subtracting the contributions of + // the outgoing row and adding those of the incoming row. + // Both increment and decrement operations are handled inside the same + // this function will be used if we have a scalar load + void update_patch_histogram(int outgoing_pixel, int incoming_pixel, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { + const size_t coarse_offset_base = + 16 * (patch_length * channel_idx + patch_offset); + const size_t fine_offset_base = + 16 * patch_length * 16 * channel_idx + 16 * patch_offset; + + const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4); + const size_t mask_new_pixel = (incoming_pixel & 0xF); + const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4); + const size_t mask_old_pixel = (outgoing_pixel & 0xF); + + const size_t fine_new_offset = fine_offset_base + mask_new_pixel + + 16 * patch_length * pixel_new_shift_right_4; + const size_t coarse_new_offset = + coarse_offset_base + pixel_new_shift_right_4; + + const size_t fine_old_offset = fine_offset_base + mask_old_pixel + + 16 * patch_length * pixel_old_shift_right_4; + const size_t coarse_old_offset = + coarse_offset_base + pixel_old_shift_right_4; + + patched_coarse[coarse_new_offset]++; + patched_coarse[coarse_old_offset]--; + patched_fine[fine_new_offset]++; + patched_fine[fine_old_offset]--; + } + + // this function will be used to handle vector load operation + void update_patch_histogram(uint8x16_t& outgoing_pixels, + uint8x16_t& incoming_pixels, size_t channel_idx, + size_t patch_length, size_t patch_offset) { + KLEIDICV_FORCE_LOOP_UNROLL + for (int i = 0; i < 16; i++) { + const size_t coarse_offset_incoming = + 16 * (patch_length * channel_idx + patch_offset + i) + + (incoming_pixels[i] >> 4); + + const size_t coarse_offset_outgoing = + 16 * (patch_length * channel_idx + patch_offset + i) + + (outgoing_pixels[i] >> 4); + + const size_t fine_offset_incoming = + 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + + patch_offset + i) + + (incoming_pixels[i] & 0xF); + + const size_t fine_offset_outgoing = + 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) + + patch_offset + i) + + (outgoing_pixels[i] & 0xF); + + patched_coarse[coarse_offset_incoming]++; + patched_coarse[coarse_offset_outgoing]--; + patched_fine[fine_offset_incoming]++; + patched_fine[fine_offset_outgoing]--; + } + } + + // This function is used without horizontal borders handling + template + void update_patch_histogram(Rows src_rows, ptrdiff_t c, + ptrdiff_t valid_old_h, ptrdiff_t valid_new_h, + size_t total_patch_span, size_t starting_width, + size_t kMargin) { + size_t vector_part = 0; + if constexpr (is_single_channel) { + vector_part = (total_patch_span >> 4) << 4; + // const uint16x8_t base = {0, 1, 2, 3, 4, 5, 6, 7}; + // uint16x8_t patch_offset_lo = base; + // uint16x8_t patch_offset_hi = patch_offset_lo; + for (size_t patch_offset = 0; patch_offset < vector_part; + patch_offset += 16) { + // patch_offset_lo = vaddq_u16(base, vdupq_n_u16(patch_offset)); + // patch_offset_hi = vaddq_u16(base, vdupq_n_u16(patch_offset + 8)); + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]); + auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]); + update_patch_histogram(outgoing_pixels, incoming_pixels, c, + total_patch_span, patch_offset); + } + } + + for (size_t patch_offset = vector_part; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = + static_cast(starting_width + patch_offset - kMargin); + auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; + auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; + update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); + } + } + + // this functiom will be used for horizontal borders handling + void update_patch_histogram(Rows src_rows, ptrdiff_t c, + ptrdiff_t valid_old_h, ptrdiff_t valid_new_h, + size_t total_patch_span, size_t starting_width, + size_t kMargin, size_t width, + FixedBorderType border_type) { + for (size_t patch_offset = 0; patch_offset < total_patch_span; + ++patch_offset) { + const ptrdiff_t valid_w = get_physical_index( + starting_width + patch_offset - kMargin, width, border_type); + auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; + auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; + update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); + } + } + + // `H` and `luc` are used for computing the median value for a single + // output pixel: + // - `H` is a histogram structure holding both the coarse and fine + // bins needed + // for the current element's median calculation. + // - `luc` is a lookup table that stores the last processed offset + // (index) + // for each coarse bin. This allows incremental fine histogram + // updates instead of full recalculation when histogram overlap is + // high. Since neighboring patches in natural images often have + // similar pixel values, reusing previous histogram state can + // significantly reduce processing time. + typedef struct { + uint16_t coarse[16]; + uint16_t fine[16][16]; + } Histogram; + + Histogram H; + uint16_t luc[16]; + + void clear_lookup_table(void) { + std::memset(&H, 0, sizeof(Histogram)); + std::memset(luc, 0, 16 * sizeof(uint16_t)); + } + + void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh, size_t kMargin) { + for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) { + v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); + } + } + + void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh) { + v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); + vst1q_u16(H.coarse, v_coarsel); + vst1q_u16(H.coarse + 8, v_coarseh); + } + + void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel, + uint16x8_t& v_coarseh) { + v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px)); + v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8)); + } + + void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, + uint16_t& luc, size_t patch_offset, size_t kMargin, + size_t total_patch_span) { + for (luc = static_cast(patch_offset - kMargin); + luc < static_cast( + std::min(patch_offset + kMargin + 1, total_patch_span)); + ++luc, px += 16) { + v_finel = vaddq_u16(v_finel, vld1q_u16(px)); + v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8)); + } + } + + void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, + uint16_t& luc, size_t patch_offset, size_t kMargin, + size_t total_patch_span) { + for (; luc < static_cast(patch_offset + kMargin + 1); ++luc) { + constexpr ptrdiff_t stride = 16; + const ptrdiff_t patch_span_limit = + static_cast(total_patch_span - 1); + const ptrdiff_t safe_luc = static_cast(luc); + + const ptrdiff_t base_offset = + stride * std::min(safe_luc, patch_span_limit); + const ptrdiff_t old_offset = + stride * std::max(safe_luc - static_cast(2 * kMargin + 1), + ptrdiff_t{0}); + const uint16x8_t new_vecl = vld1q_u16(px + base_offset); + const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8); + const uint16x8_t old_vecl = vld1q_u16(px + old_offset); + const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8); + v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl); + v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech); + } + } + + size_t find_coarse_index(size_t& cdf, size_t median_index) { + size_t coarse_index = 0; + while (true) { + cdf += H.coarse[coarse_index]; + if (cdf > median_index) { + cdf -= H.coarse[coarse_index]; + break; + } + coarse_index++; + } + return coarse_index; + } + + uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) { + uint16_t* segment = H.fine[coarse_index]; + size_t fine_index = 0; + while (true) { + cdf += segment[fine_index]; + if (cdf > median_index) { + fine_index = (16 * coarse_index + fine_index); + break; + } + fine_index++; + } + return uint8_t(fine_index); + } + + void compute_patch_median_from_histogram(size_t h, ptrdiff_t c, + size_t total_patch_span, + size_t kMargin, size_t ksize, + Rows dst) { + uint16x8_t v_coarsel = vld1q_u16(H.coarse); + uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8); + + // Before starting the main patch loop to compute medians for each element, + // we initialize the coarse histogram buffer with the first (ksize - 1). + // This allows each subsequent iteration in the patch loop to perform only + // one addition and one subtraction. + uint16_t* px = patched_coarse + 16 * total_patch_span * c; + initialize_coarse(px, v_coarsel, v_coarseh, kMargin); + + for (size_t patch_offset = kMargin; + patch_offset < total_patch_span - kMargin; patch_offset++) { + size_t median_index = (ksize * ksize) / 2, cdf = 0; + + px = patched_coarse + + 16 * (total_patch_span * c + + std::min(patch_offset + kMargin, total_patch_span - 1)); + increment_coarse(px, v_coarsel, v_coarseh); + + // Find median at coarse level + size_t coarse_index = find_coarse_index(cdf, median_index); + + uint16x8_t v_finel; + uint16x8_t v_fineh; + // Check whether the fine histogram (H.fine[coarse_index]) for the + // current patch position needs to be freshly initialized or can be + // incrementally updated. This decision hinges on the `luc` (Last Used + // Coordinate) table, which records the last horizontal patch offset + // processed for each coarse bin. + // + // The condition is true in two scenarios: + // + // 1. **First-Time Initialization**: + // - This is the first time we are accessing this coarse bin + // (`coarse_index`) at the current patch position. + // - We compute the full fine histogram from scratch by summing + // over `ksize` rows centered at the patch position. + // - We accumulate the results into the `v_finel` and `v_fineh` + // vector registers. + // - These vectors are then stored into `H.fine[coarse_index]`. + // - The `luc` table is updated to reflect the last index + // processed. + // + // 2. **Window Movement Causes Loss of Overlap**: + // - The sliding window has moved enough that it no longer + // sufficiently overlaps the region + // used to compute the previously cached fine histogram (i.e., + // `luc[coarse_index]` is too far behind). + // - We must reinitialize the fine histogram to ensure accuracy. + // + // Otherwise: + // + // - We reuse the previously computed fine histogram stored in + // `H.fine[coarse_index]`. + // - We only update it incrementally using the `update_fine()` + // function, which: + // - Adds the new values entering the window. + // - Subtracts the values leaving the window. + // - This avoids the need for a full re-scan, leveraging temporal + // locality between neighboring pixels. + // + // This lookup-based optimization significantly improves performance, + // as neighboring filter windows often overlap heavily—especially for + // small strides and moderate kernel sizes. + // + // After this step, the fine histogram is ready. The next phase scans + // `H.fine[coarse_index]` to identify the fine bin where the + // cumulative sum crosses the median threshold. This gives us the + // final median value for the output pixel. + if (luc[coarse_index] <= patch_offset - kMargin) { + v_finel = vdupq_n_u16(0); + v_fineh = vdupq_n_u16(0); + px = patched_fine + static_cast(16) * + (static_cast(total_patch_span) * + (16 * c + coarse_index) + + patch_offset - kMargin); + + initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, + kMargin, total_patch_span); + + } else { + v_finel = vld1q_u16(H.fine[coarse_index]); + v_fineh = vld1q_u16(H.fine[coarse_index] + 8); + px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index); + update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, + kMargin, total_patch_span); + } + + px = patched_coarse + + static_cast(16) * + (total_patch_span * c + + std::max(patch_offset - kMargin, static_cast(0))); + + vst1q_u16(H.fine[coarse_index], v_finel); + vst1q_u16(H.fine[coarse_index] + 8, v_fineh); + + decrement_coarse(px, v_coarsel, v_coarseh); + + // Find median at fine level + dst.at(static_cast(h), + static_cast(patch_offset - kMargin))[c] = + find_median(cdf, median_index, coarse_index); + } + } +}; + +kleidicv_error_t median_blur_large_hist_stripe_u8( + const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { + size_t kMargin = (kernel_width - 1) / 2; + MedianBlurLargeHist median_filter; + Rectangle image_dimensions{width, height}; + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + // Process left border + size_t starting_width = 0; + size_t processing_left_width = kMargin; + Point starting_left_coordinates{starting_width, y_begin}; + Point ending_left_coordinates{starting_width + processing_left_width, y_end}; + median_filter.process_pixels_with_horizontal_borders( + image_dimensions, starting_left_coordinates, ending_left_coordinates, + src_rows, dst_rows, kernel_height, border_type); + + // Process center region + starting_width = processing_left_width; + size_t processing_center_width = width - 2 * kMargin; + Point starting_center_coordinates{starting_width, y_begin}; + Point ending_center_coordinates{starting_width + processing_center_width, + y_end}; + if (channels == 1) { + median_filter.process_pixels_without_horizontal_borders( + image_dimensions, starting_center_coordinates, + ending_center_coordinates, src_rows, dst_rows, kernel_height, + border_type); + } else { + median_filter.process_pixels_without_horizontal_borders( + image_dimensions, starting_center_coordinates, + ending_center_coordinates, src_rows, dst_rows, kernel_height, + border_type); + } + + // Process right border + starting_width = processing_left_width + processing_center_width; + size_t processing_right_width = + width - processing_left_width - processing_center_width; + Point starting_right_coordinates{starting_width, y_begin}; + Point ending_right_coordinates{starting_width + processing_right_width, + y_end}; + median_filter.process_pixels_with_horizontal_borders( + image_dimensions, starting_right_coordinates, ending_right_coordinates, + src_rows, dst_rows, kernel_height, border_type); + + return KLEIDICV_OK; +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/filters/median_blur_small_hist_neon.cpp b/kleidicv/src/filters/median_blur_small_hist_neon.cpp index 3c14f6eb0..217481be9 100644 --- a/kleidicv/src/filters/median_blur_small_hist_neon.cpp +++ b/kleidicv/src/filters/median_blur_small_hist_neon.cpp @@ -6,50 +6,10 @@ #include "kleidicv/filters/median_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "median_blur_border_handling.h" namespace kleidicv::neon { -static ptrdiff_t get_physical_index(size_t index, size_t limit, - FixedBorderType border_type) { - int result = 0; - int signed_index = static_cast(index); - int signed_limit = static_cast(limit); - - if (signed_index >= 0 && signed_index < signed_limit) { - return static_cast(index); - } - switch (border_type) { - case FixedBorderType::REPLICATE: { - result = std::clamp(signed_index, 0, signed_limit - 1); - break; - } - case FixedBorderType::REFLECT: { - if (signed_index < 0) { - result = -signed_index - 1; - } else { - result = 2 * signed_limit - signed_index - 1; - } - break; - } - - case FixedBorderType::WRAP: { - result = (signed_index + signed_limit) % signed_limit; - break; - } - - case FixedBorderType::REVERSE: { - if (signed_index < 0) { - result = std::min(-signed_index, signed_limit - 1); - } else { - result = 2 * signed_limit - signed_index - 2; - } - break; - } - } - - return static_cast(result); -} - // B. Weiss, "Fast Median and Bilateral Filtering," in *ACM SIGGRAPH 2006 // Papers*, ACM, New York, NY, USA, pp. 519–526, 2006. // The paper is currently available at: @@ -62,7 +22,7 @@ class MedianBlurSmallHist { Rectangle image_dimensions, Point starting_coordinates, Point ending_coordinates, Rows src_rows, Rows dst_rows, size_t ksize, FixedBorderType border_type) { - const size_t KMargin = ksize / 2; + const size_t kMargin = ksize / 2; for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w++) { for (ptrdiff_t ch = 0; ch < static_cast(src_rows.channels()); @@ -76,10 +36,10 @@ class MedianBlurSmallHist { for (size_t r = 0; r < ksize; r++) { for (size_t c = 0; c < ksize; c++) { const ptrdiff_t valid_h = - get_physical_index(starting_coordinates.y() + r - KMargin, + get_physical_index(starting_coordinates.y() + r - kMargin, image_dimensions.height(), border_type); const ptrdiff_t valid_w = get_physical_index( - w + c - KMargin, image_dimensions.width(), border_type); + w + c - kMargin, image_dimensions.width(), border_type); uint8_t pixel = src_rows.at(valid_h, valid_w)[ch]; @@ -95,14 +55,14 @@ class MedianBlurSmallHist { for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); h++) { const ptrdiff_t valid_new_h = get_physical_index( - h + KMargin, image_dimensions.height(), border_type); + h + kMargin, image_dimensions.height(), border_type); const ptrdiff_t valid_old_h = get_physical_index( - h - KMargin - 1, image_dimensions.height(), border_type); + h - kMargin - 1, image_dimensions.height(), border_type); for (size_t c = 0; c < ksize; c++) { const ptrdiff_t valid_w = get_physical_index( - w + c - KMargin, image_dimensions.width(), border_type); + w + c - kMargin, image_dimensions.width(), border_type); uint8_t incoming_pixel = src_rows.at(valid_new_h, valid_w)[ch]; @@ -390,11 +350,11 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; MedianBlurSmallHist median_filter; - const size_t KMargin = kernel_width / 2; + const size_t kMargin = kernel_width / 2; // Process left border size_t starting_width = 0; - const size_t processing_left_width = KMargin; + const size_t processing_left_width = kMargin; Point starting_left_coordinates{starting_width, y_begin}; Point ending_left_coordinates{starting_width + processing_left_width, y_end}; @@ -405,12 +365,12 @@ kleidicv_error_t median_blur_small_hist_stripe_u8( // Process center region starting_width = processing_left_width; // Compute the width of the center region that can be processed with NEON - // instructions. Subtract 2 * KMargin to exclude left and right borders, which + // instructions. Subtract 2 * kMargin to exclude left and right borders, which // are handled separately using scalar code due to varying border modes (e.g., // REPLICATE, REFLECT, WRAP, REVERSE). Align the remaining width down to the // nearest multiple of 16 to match NEON's 128-bit register width (16 bytes for // uint8x16_t). - const size_t processing_center_width = ((width - 2 * KMargin) / 16) * 16; + const size_t processing_center_width = ((width - 2 * kMargin) / 16) * 16; Point starting_center_coordinates{starting_width * channels, y_begin}; Point ending_center_coordinates{ (processing_center_width + starting_width) * channels, y_end}; diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 3d5ddb041..45a49ef0d 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -593,7 +593,16 @@ kleidicv_error_t kleidicv_thread_median_blur_u8( return checks_result; } - if (kernel_width > 7) { + if (kernel_width <= 7) { + auto callback = [=](unsigned y_begin, unsigned y_end) { + return kleidicv_median_blur_sorting_network_stripe_u8( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, + channels, kernel_width, kernel_height, fixed_border_type); + }; + return parallel_batches(callback, mt, height); + } + + if (kernel_width > 7 && kernel_width <= 15) { auto callback = [=](unsigned y_begin, unsigned y_end) { return kleidicv_median_blur_small_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, @@ -603,7 +612,7 @@ kleidicv_error_t kleidicv_thread_median_blur_u8( } auto callback = [=](unsigned y_begin, unsigned y_end) { - return kleidicv_median_blur_sorting_network_stripe_u8( + return kleidicv_median_blur_large_hist_stripe_u8( src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, kernel_width, kernel_height, fixed_border_type); }; diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 3b22f42f2..4d8f8dc55 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -38,6 +38,9 @@ MedianBlur9x9: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, MedianBlur11x11: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 11)' MedianBlur13x13: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 13)' MedianBlur15x15: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 15)' +MedianBlur17x17: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 17)' +MedianBlur27x27: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 27)' +MedianBlur35x35: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 35)' GaussianBlur3x3: opencv_perf_imgproc '*gaussianBlur3x3/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur5x5: opencv_perf_imgproc '*gaussianBlur5x5/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' diff --git a/test/api/test_median_blur.cpp b/test/api/test_median_blur.cpp index c54ba43e7..ce9c6140a 100644 --- a/test/api/test_median_blur.cpp +++ b/test/api/test_median_blur.cpp @@ -65,13 +65,13 @@ class MedianBlurTest : public testing::Test { return cases; } - static std::vector get_unpadded_test_cases() { - std::vector widths = {50}; + static std::vector get_unpadded_test_cases(size_t filter_size) { + std::vector widths = {2 * filter_size + 16}; std::vector src_paddings = {0}; std::vector dst_paddings = {0}; - std::vector heights = {30}; - std::vector channels = {1, 4}; - std::vector filter_sizes = {3, 5, 7}; + std::vector heights = {2 * filter_size + 16}; + std::vector channels = {1, 2, 3, 4}; + std::vector filter_sizes = {filter_size}; std::vector border_types = { KLEIDICV_BORDER_TYPE_REPLICATE, KLEIDICV_BORDER_TYPE_REFLECT, KLEIDICV_BORDER_TYPE_WRAP, KLEIDICV_BORDER_TYPE_REVERSE}; @@ -80,13 +80,13 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } - static std::vector get_padded_test_cases() { - std::vector widths = {20}; + static std::vector get_padded_test_cases(size_t filter_size) { + std::vector widths = {2 * filter_size + 16}; std::vector src_paddings = {5}; std::vector dst_paddings = {13}; - std::vector heights = {10}; - std::vector channels = {1, 4}; - std::vector filter_sizes = {3, 5, 7}; + std::vector heights = {2 * filter_size + 16}; + std::vector channels = {1, 2, 3, 4}; + std::vector filter_sizes = {filter_size}; std::vector border_types = { KLEIDICV_BORDER_TYPE_REPLICATE}; @@ -94,7 +94,7 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } - static std::vector get_small_image_test_cases( + static std::vector get_small_range_filter_test_cases( size_t filter_size) { std::vector widths = {25, filter_size - 1}; std::vector src_paddings = {0}; @@ -124,6 +124,21 @@ class MedianBlurTest : public testing::Test { channels, filter_sizes, border_types); } + static std::vector get_large_range_filter_test_cases( + size_t filter_size) { + std::vector widths = {60}; + std::vector src_paddings = {0}; + std::vector dst_paddings = {5}; + std::vector heights = {60, filter_size - 1}; + std::vector channels = {1, 3}; + std::vector filter_sizes = {filter_size}; + std::vector border_types = { + KLEIDICV_BORDER_TYPE_REPLICATE, KLEIDICV_BORDER_TYPE_REFLECT, + KLEIDICV_BORDER_TYPE_WRAP, KLEIDICV_BORDER_TYPE_REVERSE}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, filter_sizes, border_types); + } + void run_test_case(const TestParams& params) { test::Array2D src{params.width * params.channels, params.height, params.src_padding, @@ -233,21 +248,42 @@ using ElementTypes = ::testing::Types; TYPED_TEST_SUITE(MedianBlurTest, ElementTypes); -TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithoutPadding) { +TYPED_TEST(MedianBlurTest, + RunAllParamCombinationsWithoutPaddingWithSmallFilterSize) { + if (test::Options::are_long_running_tests_skipped()) { + GTEST_SKIP() + << "Long running test " + "MedianBlurTest::" + "RunAllParamCombinationsWithoutPaddingWithSmallFilterSize skipped"; + } + + for (auto ksize : {3, 5, 7}) { + for (const auto& params : TestFixture::get_unpadded_test_cases(ksize)) { + this->run_test_case(params); + } + } +} + +TYPED_TEST(MedianBlurTest, + RunAllParamCombinationsWithPaddingWithSmallFilterSize) { if (test::Options::are_long_running_tests_skipped()) { GTEST_SKIP() << "Long running test " - "MedianBlurTest::RunAllParamCombinationsWithoutPadding " + "MedianBlurTest::" + "RunAllParamCombinationsWithPaddingWithSmallFilterSize " "skipped"; } - for (const auto& params : TestFixture::get_unpadded_test_cases()) { - this->run_test_case(params); + for (auto ksize : {3, 5, 7}) { + for (const auto& params : TestFixture::get_padded_test_cases(ksize)) { + this->run_test_case(params); + } } } -TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithSmallImageSize) { +TYPED_TEST(MedianBlurTest, RunAllParamCombinationsWithSmallImageAndFilterSize) { for (auto ksize : {3, 5, 7}) { - for (const auto& params : TestFixture::get_small_image_test_cases(ksize)) { + for (const auto& params : + TestFixture::get_small_range_filter_test_cases(ksize)) { this->run_test_case(params); } } @@ -426,32 +462,57 @@ TYPED_TEST(MedianBlurTest, SrcDstChannelCombinations) { EXPECT_EQ(KLEIDICV_OK, status); } } - template -class MedianBlurByteStrideTest : public MedianBlurTest {}; +class MedianBlurMidAndLargeRangeTest : public MedianBlurTest {}; +using ByteType = ::testing::Types; +TYPED_TEST_SUITE(MedianBlurMidAndLargeRangeTest, ByteType); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithSmallImageAndMidRangeFilterSize) { + for (const auto& params : TestFixture::get_mid_range_filter_test_cases()) { + this->run_test_case(params); + } +} -using ByteStrideTypes = ::testing::Types; -TYPED_TEST_SUITE(MedianBlurByteStrideTest, ByteStrideTypes); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithSmallImageAndLargeRangeFilterSize) { + for (auto ksize : {17, 35}) { + for (const auto& params : + TestFixture::get_large_range_filter_test_cases(ksize)) { + this->run_test_case(params); + } + } +} -TYPED_TEST(MedianBlurByteStrideTest, RunAllParamCombinationsWithPadding) { +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithoutPaddingWithMidAndLargeFilterSize) { if (test::Options::are_long_running_tests_skipped()) { GTEST_SKIP() << "Long running test " - "MedianBlurByteStrideTest::RunAllParamCombinationsWithPadding " + "MedianBlurMidAndLargeRangeTest::" + "RunAllParamCombinationsWithoutPaddingWithMidAndLargeFilterSize " "skipped"; } - for (const auto& params : TestFixture::get_padded_test_cases()) { - this->run_test_case(params); + for (auto ksize : {9, 15, 17, 255}) { + for (const auto& params : TestFixture::get_unpadded_test_cases(ksize)) { + this->run_test_case(params); + } } } -template -class MedianBlurMidRangeTest : public MedianBlurTest {}; -using ByteType = ::testing::Types; -TYPED_TEST_SUITE(MedianBlurMidRangeTest, ByteType); -TYPED_TEST(MedianBlurMidRangeTest, RunAllParamCombinationsWithMidRangeFilters) { - for (const auto& params : TestFixture::get_mid_range_filter_test_cases()) { - this->run_test_case(params); +TYPED_TEST(MedianBlurMidAndLargeRangeTest, + RunAllParamCombinationsWithPaddingWithMidAndLargeFilterSize) { + if (test::Options::are_long_running_tests_skipped()) { + GTEST_SKIP() + << "Long running test " + "MedianBlurMidAndLargeRangeTest::" + "RunAllParamCombinationsWithPaddingWithMidAndLargeFilterSize " + "skipped"; + } + + for (auto ksize : {9, 15, 17, 255}) { + for (const auto& params : TestFixture::get_padded_test_cases(ksize)) { + this->run_test_case(params); + } } } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index f93d8b8cc..0dafa07ed 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -113,7 +113,7 @@ class Thread : public testing::TestWithParam

{ (void)thread_count; size_t channels = 1; kleidicv_border_type_t border_type = KLEIDICV_BORDER_TYPE_REPLICATE; - const auto &filter_size = std::vector{3, 5, 7, 9}; + const auto &filter_size = std::vector{3, 5, 7, 9, 17}; for (auto ksize : filter_size) { check_unary_op(single_threaded_func, multithreaded_func, channels, channels, channels, ksize, ksize, border_type); -- GitLab From 788fc5326fb9190966957c8c32c0b92bd47d832e Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 15:19:09 +0100 Subject: [PATCH 2/9] fix building issue for FixedBorderType::WRAP --- kleidicv/src/filters/median_blur_border_handling.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kleidicv/src/filters/median_blur_border_handling.h b/kleidicv/src/filters/median_blur_border_handling.h index 5a7913d84..17900440d 100644 --- a/kleidicv/src/filters/median_blur_border_handling.h +++ b/kleidicv/src/filters/median_blur_border_handling.h @@ -31,10 +31,13 @@ static ptrdiff_t get_physical_index(size_t index, size_t limit, } case FixedBorderType::WRAP: { - result = signed_index < 0 - ? signed_limit + signed_index - : (signed_index >= signed_limit ? signed_index - signed_limit - : signed_index); + if (signed_index < 0) { + result = signed_limit + signed_index; + } else if (signed_index >= signed_limit) { + result = signed_index - signed_limit; + } else { + result = signed_index; + } break; } -- GitLab From 7882139d0d555eb70be7833eb086c1d796f41fbc Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 15:22:53 +0100 Subject: [PATCH 3/9] Remove extra space from the table --- doc/functionality.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/functionality.md b/doc/functionality.md index 7ecdd4662..984e784e0 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -72,15 +72,15 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Rotate (90 degrees clockwise) | x | x | x | x | ## Image filters -| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | -|----------------------------------------------|-----|-----|-----|-----|-----|-----|-----| -| Erode | | x | | | | | | -| Dilate | | x | | | | | | -| Sobel (3x3) | | x | | | | | | -| Separable Filter 2D (5x5) | | x | x | x | | | | -| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | -| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | -| Median Blur (generic imp, max size 255x255) | | x | | | | | | +| | s8 | u8 | s16 | u16 | s32 | u32 | f32 | +|---------------------------------------------|-----|-----|-----|-----|-----|-----|-----| +| Erode | | x | | | | | | +| Dilate | | x | | | | | | +| Sobel (3x3) | | x | | | | | | +| Separable Filter 2D (5x5) | | x | x | x | | | | +| Gaussian Blur (3x3, 5x5, 7x7, 15x15, 21x21) | | x | | | | | | +| Median Blur (3x3, 5x5, 7x7) | x | x | x | x | x | x | x | +| Median Blur (generic imp, max size 255x255) | | x | | | | | | ## Resize to quarter -- GitLab From 1ff77ce6da7dbc9963c5bfce60faff824b9b56ac Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 15:24:58 +0100 Subject: [PATCH 4/9] Add median blur size limit inside changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71c3cee7f..2584beee1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 3x3 kernels. -- Median Blur for generic kernels, Neon backend only. +- Median Blur for generic kernels (odd-sized only, max kernel size 255x255), Neon backend only. ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. -- GitLab From db6b5b21fe2ab78e58c2cb8f1ef1b917fd6afab9 Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 16:31:59 +0100 Subject: [PATCH 5/9] Refactor Median blur with large histogram --- .../filters/median_blur_large_hist_neon.cpp | 213 ++++++++---------- .../filters/median_blur_small_hist_neon.cpp | 2 +- 2 files changed, 100 insertions(+), 115 deletions(-) diff --git a/kleidicv/src/filters/median_blur_large_hist_neon.cpp b/kleidicv/src/filters/median_blur_large_hist_neon.cpp index cf48df90f..9a760535d 100644 --- a/kleidicv/src/filters/median_blur_large_hist_neon.cpp +++ b/kleidicv/src/filters/median_blur_large_hist_neon.cpp @@ -10,30 +10,31 @@ namespace kleidicv::neon { +template class MedianBlurLargeHist { public: - MedianBlurLargeHist() : patched_coarse{}, patched_fine{}, H{}, luc{} {} - - template - void process_pixels_without_horizontal_borders( - Rectangle image_dimensions, Point starting_coordinates, - Point ending_coordinates, Rows src_rows, - Rows dst_rows, size_t ksize, FixedBorderType border_type) { - const size_t kMargin = ksize / 2; - constexpr size_t patch_length = 512; - + MedianBlurLargeHist(size_t channels, size_t kMargin) + : patched_coarse{}, patched_fine{}, H{}, luc{} { // Allocate histogram buffer for a patch plus extra margin pixels on both // sides. // To compute a complete ksize x ksize median filter for each pixel in the // patch, we need kMargin extra pixels on the left and right (i.e., // 2*kMargin total) so each patch row has enough context (e.g., for 5x5 // filter, kMargin=2 → 1+2+2=5 pixels per row). - patched_coarse = - static_cast(malloc(16 * (patch_length + 2 * kMargin) * - src_rows.channels() * sizeof(uint16_t))); + patched_coarse = static_cast( + malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * + sizeof(uint16_t))); patched_fine = - static_cast(malloc(256 * (patch_length + 2 * kMargin) * - src_rows.channels() * sizeof(uint16_t))); + &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]; + } + + ~MedianBlurLargeHist() { free(patched_coarse); } + + void process_pixels_without_horizontal_borders( + Rectangle image_dimensions, Point starting_coordinates, + Point ending_coordinates, Rows src_rows, + Rows dst_rows, size_t ksize, FixedBorderType border_type) { + const size_t kMargin = ksize / 2; for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w += patch_length) { @@ -58,7 +59,7 @@ class MedianBlurLargeHist { const ptrdiff_t valid_h = get_physical_index(starting_coordinates.y() + r - kMargin, image_dimensions.height(), border_type); - initialize_patched_histogram( + initialize_patched_histogram_without_horizontal_borders( src_rows, c, valid_h, total_patch_span, w, kMargin); } compute_patch_median_from_histogram(starting_coordinates.y(), c, @@ -78,7 +79,7 @@ class MedianBlurLargeHist { ++c) { clear_lookup_table(); - update_patch_histogram( + update_patch_histogram_without_horizontal_borders( src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, kMargin); @@ -87,9 +88,6 @@ class MedianBlurLargeHist { } } } - - free(patched_coarse); - free(patched_fine); } void process_pixels_with_horizontal_borders( @@ -97,14 +95,6 @@ class MedianBlurLargeHist { Point ending_coordinates, Rows src_rows, Rows dst_rows, size_t ksize, FixedBorderType border_type) { const size_t kMargin = ksize / 2; - constexpr size_t patch_length = 512; - - patched_coarse = - static_cast(malloc(16 * (patch_length + 2 * kMargin) * - src_rows.channels() * sizeof(uint16_t))); - patched_fine = - static_cast(malloc(256 * (patch_length + 2 * kMargin) * - src_rows.channels() * sizeof(uint16_t))); for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w += patch_length) { @@ -127,9 +117,9 @@ class MedianBlurLargeHist { const ptrdiff_t valid_h = get_physical_index(starting_coordinates.y() + r - kMargin, image_dimensions.height(), border_type); - initialize_patched_histogram(src_rows, c, valid_h, total_patch_span, - w, kMargin, image_dimensions.width(), - border_type); + initialize_patched_histogram_with_horizontal_borders( + src_rows, c, valid_h, total_patch_span, w, kMargin, + image_dimensions.width(), border_type); } compute_patch_median_from_histogram(starting_coordinates.y(), c, total_patch_span, kMargin, ksize, @@ -146,18 +136,15 @@ class MedianBlurLargeHist { ++c) { clear_lookup_table(); - update_patch_histogram(src_rows, c, valid_old_h, valid_new_h, - total_patch_span, w, kMargin, - image_dimensions.width(), border_type); + update_patch_histogram_with_horizontal_borders( + src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, + kMargin, image_dimensions.width(), border_type); compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, ksize, shifted_dst); } } } - - free(patched_coarse); - free(patched_fine); } private: @@ -182,7 +169,7 @@ class MedianBlurLargeHist { // // This layout enables fast linear access while preserving the hierarchical // structure of histograms per channel and patch position. - + constexpr static size_t patch_length = 512; uint16_t* patched_coarse; uint16_t* patched_fine; @@ -198,9 +185,10 @@ class MedianBlurLargeHist { 256 * total_patch_size * sizeof(patched_fine[0])); } - // will be used for scalar load - void initialize_patched_histogram(int incoming_pixel, size_t channel_idx, - size_t patch_length, size_t patch_offset) { + void scalar_initialize_patched_histogram(int incoming_pixel, + size_t channel_idx, + size_t patch_length, + size_t patch_offset) { const size_t coarse_offset = 16 * (patch_length * channel_idx + patch_offset) + (incoming_pixel >> 4); @@ -212,10 +200,10 @@ class MedianBlurLargeHist { patched_fine[fine_offset]++; } - // will be used if we have vector load for single channel - void initialize_patched_histogram(uint8x16_t& incoming_pixels, - size_t channel_idx, size_t patch_length, - size_t patch_offset) { + void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels, + size_t channel_idx, + size_t patch_length, + size_t patch_offset) { KLEIDICV_FORCE_LOOP_UNROLL for (int i = 0; i < 16; i++) { const size_t coarse_offset_incoming = @@ -232,11 +220,9 @@ class MedianBlurLargeHist { } } - // This function is used without horizontal borders handling - template - void initialize_patched_histogram(Rows src_rows, ptrdiff_t c, - ptrdiff_t valid_h, size_t total_patch_span, - size_t starting_width, size_t kMargin) { + void initialize_patched_histogram_without_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_h, + size_t total_patch_span, size_t starting_width, size_t kMargin) { size_t vector_part = 0; if constexpr (is_single_channel) { vector_part = (total_patch_span >> 4) << 4; @@ -245,8 +231,8 @@ class MedianBlurLargeHist { const ptrdiff_t valid_w = static_cast(starting_width + patch_offset - kMargin); auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]); - initialize_patched_histogram(incoming_pixels, c, total_patch_span, - patch_offset); + vector_initialize_patched_histogram(incoming_pixels, c, + total_patch_span, patch_offset); } } @@ -256,16 +242,15 @@ class MedianBlurLargeHist { static_cast(starting_width + patch_offset - kMargin); auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; - initialize_patched_histogram(incoming_pixel, c, total_patch_span, - patch_offset); + scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); } } - // This function is used with horizontal borders handling - void initialize_patched_histogram(Rows src_rows, ptrdiff_t c, - ptrdiff_t valid_h, size_t total_patch_span, - size_t starting_width, size_t kMargin, - size_t width, FixedBorderType border_type) { + void initialize_patched_histogram_with_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_h, + size_t total_patch_span, size_t starting_width, size_t kMargin, + size_t width, FixedBorderType border_type) { for (size_t patch_offset = 0; patch_offset < total_patch_span; ++patch_offset) { ptrdiff_t valid_w = get_physical_index( @@ -273,8 +258,8 @@ class MedianBlurLargeHist { auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; - initialize_patched_histogram(incoming_pixel, c, total_patch_span, - patch_offset); + scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, + patch_offset); } } @@ -283,10 +268,10 @@ class MedianBlurLargeHist { // histogram must be updated accordingly by subtracting the contributions of // the outgoing row and adding those of the incoming row. // Both increment and decrement operations are handled inside the same - // this function will be used if we have a scalar load - void update_patch_histogram(int outgoing_pixel, int incoming_pixel, - size_t channel_idx, size_t patch_length, - size_t patch_offset) { + // function. + void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { const size_t coarse_offset_base = 16 * (patch_length * channel_idx + patch_offset); const size_t fine_offset_base = @@ -313,10 +298,10 @@ class MedianBlurLargeHist { patched_fine[fine_old_offset]--; } - // this function will be used to handle vector load operation - void update_patch_histogram(uint8x16_t& outgoing_pixels, - uint8x16_t& incoming_pixels, size_t channel_idx, - size_t patch_length, size_t patch_offset) { + void vector_update_patch_histogram(uint8x16_t& outgoing_pixels, + uint8x16_t& incoming_pixels, + size_t channel_idx, size_t patch_length, + size_t patch_offset) { KLEIDICV_FORCE_LOOP_UNROLL for (int i = 0; i < 16; i++) { const size_t coarse_offset_incoming = @@ -344,28 +329,21 @@ class MedianBlurLargeHist { } } - // This function is used without horizontal borders handling - template - void update_patch_histogram(Rows src_rows, ptrdiff_t c, - ptrdiff_t valid_old_h, ptrdiff_t valid_new_h, - size_t total_patch_span, size_t starting_width, - size_t kMargin) { + void update_patch_histogram_without_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, + ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, + size_t kMargin) { size_t vector_part = 0; if constexpr (is_single_channel) { vector_part = (total_patch_span >> 4) << 4; - // const uint16x8_t base = {0, 1, 2, 3, 4, 5, 6, 7}; - // uint16x8_t patch_offset_lo = base; - // uint16x8_t patch_offset_hi = patch_offset_lo; for (size_t patch_offset = 0; patch_offset < vector_part; patch_offset += 16) { - // patch_offset_lo = vaddq_u16(base, vdupq_n_u16(patch_offset)); - // patch_offset_hi = vaddq_u16(base, vdupq_n_u16(patch_offset + 8)); const ptrdiff_t valid_w = static_cast(starting_width + patch_offset - kMargin); auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]); auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]); - update_patch_histogram(outgoing_pixels, incoming_pixels, c, - total_patch_span, patch_offset); + vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c, + total_patch_span, patch_offset); } } @@ -375,25 +353,23 @@ class MedianBlurLargeHist { static_cast(starting_width + patch_offset - kMargin); auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; - update_patch_histogram(outgoing_pixel, incoming_pixel, c, - total_patch_span, patch_offset); + scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); } } - // this functiom will be used for horizontal borders handling - void update_patch_histogram(Rows src_rows, ptrdiff_t c, - ptrdiff_t valid_old_h, ptrdiff_t valid_new_h, - size_t total_patch_span, size_t starting_width, - size_t kMargin, size_t width, - FixedBorderType border_type) { + void update_patch_histogram_with_horizontal_borders( + Rows src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, + ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, + size_t kMargin, size_t width, FixedBorderType border_type) { for (size_t patch_offset = 0; patch_offset < total_patch_span; ++patch_offset) { const ptrdiff_t valid_w = get_physical_index( starting_width + patch_offset - kMargin, width, border_type); auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; - update_patch_histogram(outgoing_pixel, incoming_pixel, c, - total_patch_span, patch_offset); + scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, + total_patch_span, patch_offset); } } @@ -616,15 +592,14 @@ class MedianBlurLargeHist { } }; -kleidicv_error_t median_blur_large_hist_stripe_u8( - const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, - size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, - size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { +template +void median_process(Rectangle image_dimensions, Rows src_rows, + Rows dst_rows, size_t y_begin, size_t y_end, + size_t kernel_width, size_t kernel_height, + FixedBorderType border_type) { size_t kMargin = (kernel_width - 1) / 2; - MedianBlurLargeHist median_filter; - Rectangle image_dimensions{width, height}; - Rows src_rows{src, src_stride, channels}; - Rows dst_rows{dst, dst_stride, channels}; + MedianBlurLargeHist median_filter{src_rows.channels(), + kMargin}; // Process left border size_t starting_width = 0; @@ -637,32 +612,42 @@ kleidicv_error_t median_blur_large_hist_stripe_u8( // Process center region starting_width = processing_left_width; - size_t processing_center_width = width - 2 * kMargin; + size_t processing_center_width = image_dimensions.width() - 2 * kMargin; Point starting_center_coordinates{starting_width, y_begin}; Point ending_center_coordinates{starting_width + processing_center_width, y_end}; - if (channels == 1) { - median_filter.process_pixels_without_horizontal_borders( - image_dimensions, starting_center_coordinates, - ending_center_coordinates, src_rows, dst_rows, kernel_height, - border_type); - } else { - median_filter.process_pixels_without_horizontal_borders( - image_dimensions, starting_center_coordinates, - ending_center_coordinates, src_rows, dst_rows, kernel_height, - border_type); - } + median_filter.process_pixels_without_horizontal_borders( + image_dimensions, starting_center_coordinates, ending_center_coordinates, + src_rows, dst_rows, kernel_height, border_type); // Process right border starting_width = processing_left_width + processing_center_width; - size_t processing_right_width = - width - processing_left_width - processing_center_width; + size_t processing_right_width = image_dimensions.width() - + processing_left_width - + processing_center_width; Point starting_right_coordinates{starting_width, y_begin}; Point ending_right_coordinates{starting_width + processing_right_width, y_end}; median_filter.process_pixels_with_horizontal_borders( image_dimensions, starting_right_coordinates, ending_right_coordinates, src_rows, dst_rows, kernel_height, border_type); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8( + const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { + Rectangle image_dimensions{width, height}; + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + if (channels == 1) { + median_process(image_dimensions, src_rows, dst_rows, y_begin, y_end, + kernel_width, kernel_height, border_type); + } else { + median_process(image_dimensions, src_rows, dst_rows, y_begin, y_end, + kernel_width, kernel_height, border_type); + } return KLEIDICV_OK; } diff --git a/kleidicv/src/filters/median_blur_small_hist_neon.cpp b/kleidicv/src/filters/median_blur_small_hist_neon.cpp index 217481be9..32ab921a8 100644 --- a/kleidicv/src/filters/median_blur_small_hist_neon.cpp +++ b/kleidicv/src/filters/median_blur_small_hist_neon.cpp @@ -342,7 +342,7 @@ class MedianBlurSmallHist { } }; -kleidicv_error_t median_blur_small_hist_stripe_u8( +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_small_hist_stripe_u8( const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { -- GitLab From a56fae494b9eb860495261632d21771043165a43 Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 16:48:43 +0100 Subject: [PATCH 6/9] fix building issue --- .../filters/median_blur_large_hist_neon.cpp | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kleidicv/src/filters/median_blur_large_hist_neon.cpp b/kleidicv/src/filters/median_blur_large_hist_neon.cpp index 9a760535d..cae3fa15c 100644 --- a/kleidicv/src/filters/median_blur_large_hist_neon.cpp +++ b/kleidicv/src/filters/median_blur_large_hist_neon.cpp @@ -14,19 +14,13 @@ template class MedianBlurLargeHist { public: MedianBlurLargeHist(size_t channels, size_t kMargin) - : patched_coarse{}, patched_fine{}, H{}, luc{} { - // Allocate histogram buffer for a patch plus extra margin pixels on both - // sides. - // To compute a complete ksize x ksize median filter for each pixel in the - // patch, we need kMargin extra pixels on the left and right (i.e., - // 2*kMargin total) so each patch row has enough context (e.g., for 5x5 - // filter, kMargin=2 → 1+2+2=5 pixels per row). - patched_coarse = static_cast( - malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * - sizeof(uint16_t))); - patched_fine = - &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]; - } + : patched_coarse{static_cast( + malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * + sizeof(uint16_t)))}, + patched_fine{ + &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]}, + H{}, + luc{} {} ~MedianBlurLargeHist() { free(patched_coarse); } -- GitLab From 6b5c4834a7addf14194b32b774279fec0be3814a Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 17:26:28 +0100 Subject: [PATCH 7/9] Fix coverage for median --- .../include/kleidicv/filters/median_blur.h | 12 ++++++--- .../src/filters/median_blur_border_handling.h | 4 +-- .../median_blur_sorting_network_neon.cpp | 14 +++++----- .../filters/median_blur_sorting_network_sc.h | 14 +++++----- test/api/test_median_blur.cpp | 27 +++++++------------ 5 files changed, 31 insertions(+), 40 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/median_blur.h b/kleidicv/include/kleidicv/filters/median_blur.h index 31efc8da9..70329a3e9 100644 --- a/kleidicv/include/kleidicv/filters/median_blur.h +++ b/kleidicv/include/kleidicv/filters/median_blur.h @@ -157,10 +157,16 @@ inline kleidicv_error_t check_ptrs_strides_imagesizes(const T *src, return KLEIDICV_OK; } +template inline bool is_kernel_size_supported(size_t kernel_width, size_t kernel_height) { - return (kernel_width == kernel_height) && (kernel_width >= 3) && - (kernel_width <= 255) && ((kernel_width % 2) != 0); + if (std::is_same_v) { + return (kernel_width == kernel_height) && (kernel_width >= 3) && + (kernel_width <= 255) && ((kernel_width % 2) != 0); + } else { + return (kernel_width == kernel_height) && (kernel_width >= 3) && + (kernel_width <= 7) && ((kernel_width % 2) != 0); + } } template @@ -178,7 +184,7 @@ inline std::pair median_blur_is_implemented( if ((src != dst) && (channels <= KLEIDICV_MAXIMUM_CHANNEL_COUNT) && (height >= kernel_height - 1) && (width >= kernel_width - 1) && - is_kernel_size_supported(kernel_width, kernel_height) && + is_kernel_size_supported(kernel_width, kernel_height) && fixed_border_type.has_value()) { return std::make_pair(KLEIDICV_OK, *fixed_border_type); } diff --git a/kleidicv/src/filters/median_blur_border_handling.h b/kleidicv/src/filters/median_blur_border_handling.h index 17900440d..3feab6d41 100644 --- a/kleidicv/src/filters/median_blur_border_handling.h +++ b/kleidicv/src/filters/median_blur_border_handling.h @@ -33,10 +33,8 @@ static ptrdiff_t get_physical_index(size_t index, size_t limit, case FixedBorderType::WRAP: { if (signed_index < 0) { result = signed_limit + signed_index; - } else if (signed_index >= signed_limit) { - result = signed_index - signed_limit; } else { - result = signed_index; + result = signed_index - signed_limit; } break; } diff --git a/kleidicv/src/filters/median_blur_sorting_network_neon.cpp b/kleidicv/src/filters/median_blur_sorting_network_neon.cpp index e46e37c9a..645cb2ea6 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_neon.cpp +++ b/kleidicv/src/filters/median_blur_sorting_network_neon.cpp @@ -187,6 +187,7 @@ kleidicv_error_t median_blur_sorting_network_stripe( border_type, filter); return KLEIDICV_OK; } + if (kernel_width == 5) { MedianBlurSortingNetwork median_filter; Filter2D5x5> filter{median_filter}; @@ -194,15 +195,12 @@ kleidicv_error_t median_blur_sorting_network_stripe( filter); return KLEIDICV_OK; } - if (kernel_width == 7) { - MedianBlurSortingNetwork median_filter; - Filter2D7x7> filter{median_filter}; - process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, - filter); - return KLEIDICV_OK; - } - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + MedianBlurSortingNetwork median_filter; + Filter2D7x7> filter{median_filter}; + process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, + filter); + return KLEIDICV_OK; } #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ diff --git a/kleidicv/src/filters/median_blur_sorting_network_sc.h b/kleidicv/src/filters/median_blur_sorting_network_sc.h index 895b363c3..f8496b3e8 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_sc.h +++ b/kleidicv/src/filters/median_blur_sorting_network_sc.h @@ -144,6 +144,7 @@ kleidicv_error_t median_blur_sorting_network_stripe_sc( border_type, filter); return KLEIDICV_OK; } + if (kernel_width == 5) { MedianBlurSortingNetwork median_filter; Filter2D5x5> filter{median_filter}; @@ -151,15 +152,12 @@ kleidicv_error_t median_blur_sorting_network_stripe_sc( filter); return KLEIDICV_OK; } - if (kernel_width == 7) { - MedianBlurSortingNetwork median_filter; - Filter2D7x7> filter{median_filter}; - process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, - filter); - return KLEIDICV_OK; - } - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + MedianBlurSortingNetwork median_filter; + Filter2D7x7> filter{median_filter}; + process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, + filter); + return KLEIDICV_OK; } } // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/test/api/test_median_blur.cpp b/test/api/test_median_blur.cpp index ce9c6140a..7f3326b2c 100644 --- a/test/api/test_median_blur.cpp +++ b/test/api/test_median_blur.cpp @@ -360,62 +360,53 @@ TYPED_TEST(MedianBlurTest, OversizeImage) { } TYPED_TEST(MedianBlurTest, UnsupportedFilterSizes) { - test::Array2D src{100, 100}; - test::Array2D dst{100, 100}; + test::Array2D src{1000, 1000}; + test::Array2D dst{1000, 1000}; // Test unsupported large square filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 100, 100, + dst.stride(), 1000, 1000, 1, 257, 257, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test non-square filter with valid height EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 100, 5, + dst.stride(), 1000, 1000, 1, 100, 5, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test non-square filter with valid width EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 5, 100, + dst.stride(), 1000, 1000, 1, 5, 100, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test unsupported small filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 1, 1, + dst.stride(), 1000, 1000, 1, 1, 1, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test unsupported even filter EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 4, 4, + dst.stride(), 1000, 1000, 1, 4, 4, KLEIDICV_BORDER_TYPE_REPLICATE)); // Test mid-range square filters that are not implemented EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 9, 9, + dst.stride(), 1000, 1000, 1, 9, 9, KLEIDICV_BORDER_TYPE_TRANSPARENT)); if (!std::is_same_v) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 9, 9, + dst.stride(), 1000, 1000, 1, 9, 9, KLEIDICV_BORDER_TYPE_REPLICATE)); } } -TYPED_TEST(MedianBlurTest, NonSquareFilterSizeWithValidWidth) { - test::Array2D src{100, 100}; - test::Array2D dst{100, 100}; - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - median_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 100, 100, 1, 5, 100, - KLEIDICV_BORDER_TYPE_REPLICATE)); -} - TYPED_TEST(MedianBlurTest, SrcDstChannelCombinations) { using ElementType = TypeParam; constexpr size_t width = 10; -- GitLab From 2187c338f7500d447220d4de8a2907f16565c3e9 Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 17:37:04 +0100 Subject: [PATCH 8/9] Fix building issue --- kleidicv/include/kleidicv/filters/median_blur.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kleidicv/include/kleidicv/filters/median_blur.h b/kleidicv/include/kleidicv/filters/median_blur.h index 70329a3e9..8fbaf2c36 100644 --- a/kleidicv/include/kleidicv/filters/median_blur.h +++ b/kleidicv/include/kleidicv/filters/median_blur.h @@ -160,7 +160,7 @@ inline kleidicv_error_t check_ptrs_strides_imagesizes(const T *src, template inline bool is_kernel_size_supported(size_t kernel_width, size_t kernel_height) { - if (std::is_same_v) { + if constexpr (std::is_same_v) { return (kernel_width == kernel_height) && (kernel_width >= 3) && (kernel_width <= 255) && ((kernel_width % 2) != 0); } else { -- GitLab From 709a1ab29d8df18fac5c7b0ee93c0cfa17559dc7 Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Thu, 3 Jul 2025 18:10:09 +0100 Subject: [PATCH 9/9] increase width in thread testing for large filter size --- test/api/test_thread.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 0dafa07ed..03649e374 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -957,7 +957,7 @@ INSTANTIATE_TEST_SUITE_P( P{1, 7, 4}, P{12, 34, 5}, P{1, 16, 1}, P{1, 32, 1}, P{1, 32, 2}, P{2, 16, 2}, P{2, 32, 1}, P{1, 48, 2}, P{2, 48, 1}, P{6, 64, 1}, P{4, 80, 2}, P{2, 96, 3}, - P{1, 112, 4}, P{12, 34, 5})); + P{1, 112, 4}, P{12, 34, 5}, P{40, 34, 5})); TEST(ThreadScaleU8, NotImplemented) { test::Array2D src(size_t{1}, 1), dst(size_t{1}, 1); -- GitLab