From a6bc85376a83f10a83b97364a84a87ab2207537d Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Fri, 30 May 2025 11:56:18 +0100 Subject: [PATCH] Add SVE version of 5x5 Median filter --- doc/opencv.md | 2 +- kleidicv/include/kleidicv/filters/filter_2d.h | 72 ++++ .../kleidicv/filters/filter_2d_5x5_base.h | 79 ++++ .../kleidicv/filters/filter_2d_5x5_neon.h | 101 +++++ .../kleidicv/filters/filter_2d_5x5_sc.h | 109 +++++ kleidicv/include/kleidicv/kleidicv.h | 7 +- kleidicv/include/kleidicv/sve2.h | 37 +- kleidicv/src/filters/median_blur_api.cpp | 32 +- kleidicv/src/filters/median_blur_neon.cpp | 396 +++--------------- kleidicv/src/filters/median_blur_sc.h | 92 ++++ kleidicv/src/filters/median_blur_sme2.cpp | 36 ++ .../filters/median_blur_sorting_network_5x5.h | 143 +++++++ kleidicv/src/filters/median_blur_sve2.cpp | 35 ++ 13 files changed, 786 insertions(+), 355 deletions(-) create mode 100644 kleidicv/include/kleidicv/filters/filter_2d.h create mode 100644 kleidicv/include/kleidicv/filters/filter_2d_5x5_base.h create mode 100644 kleidicv/include/kleidicv/filters/filter_2d_5x5_neon.h create mode 100644 kleidicv/include/kleidicv/filters/filter_2d_5x5_sc.h create mode 100644 kleidicv/src/filters/median_blur_sc.h create mode 100644 kleidicv/src/filters/median_blur_sme2.cpp create mode 100644 kleidicv/src/filters/median_blur_sorting_network_5x5.h create mode 100644 kleidicv/src/filters/median_blur_sve2.cpp diff --git a/doc/opencv.md b/doc/opencv.md index 113252fe6..2ab35dd15 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -161,7 +161,7 @@ Notes on parameters: Applies median filter to a given image. Notes on parameters: -* `src.cols`,`src.rows` - image width and height must be `>=5`. +* `src.cols`,`src.rows` - image width and height must be greater than or equal to `ksize - 1` (i.e. `>= 4` for 5x5). * `ksize` - must be 5, as KleidiCV only supports 5x5 kernel size. ### [`cv::transpose()`](https://docs.opencv.org/4.10.0/d2/de8/group__core__array.html#ga46630ed6c0ea6254a35f447289bd7404) diff --git a/kleidicv/include/kleidicv/filters/filter_2d.h b/kleidicv/include/kleidicv/filters/filter_2d.h new file mode 100644 index 000000000..cac7c71ad --- /dev/null +++ b/kleidicv/include/kleidicv/filters/filter_2d.h @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_FILTERS_FILTER_2D_H +#define KLEIDICV_FILTERS_FILTER_2D_H + +#include "kleidicv/kleidicv.h" +#include "kleidicv/types.h" +#include "kleidicv/workspace/border_types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Primary Template for Filter 2D. +template +class Filter2D; + +template +void process_filter2d(Rectangle rect, size_t y_begin, size_t y_end, + Rows src_rows, + Rows dst_rows, + typename FilterType::BorderType border_type, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + // Border helper which calculates border offsets. + typename FilterType::BorderInfoType vertical_border{rect.height(), + border_type}; + typename FilterType::BorderInfoType horizontal_border{rect.width(), + border_type}; + + for (size_t vertical_index = y_begin; vertical_index < y_end; + ++vertical_index) { + auto vertical_offsets = vertical_border.offsets_with_border(vertical_index); + constexpr size_t kMargin = filter.kMargin; + + // Process data affected by left border. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t horizontal_index = 0; horizontal_index < kMargin; + ++horizontal_index) { + auto horizontal_offsets = + horizontal_border.offsets_with_left_border(horizontal_index); + filter.process_one_pixel_with_horizontal_borders( + src_rows.at(vertical_index, horizontal_index), + dst_rows.at(vertical_index, horizontal_index), vertical_offsets, + horizontal_offsets); + } + + // Process data which is not affected by any borders in bulk. + size_t width_without_borders = rect.width() - (2 * kMargin); + auto horizontal_offsets = horizontal_border.offsets_without_border(); + filter.process_pixels_without_horizontal_borders( + width_without_borders, src_rows.at(vertical_index, kMargin), + dst_rows.at(vertical_index, kMargin), vertical_offsets, + horizontal_offsets); + + // Process data affected by right border. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t horizontal_index = 0; horizontal_index < kMargin; + ++horizontal_index) { + size_t index = rect.width() - kMargin + horizontal_index; + auto horizontal_offsets = + horizontal_border.offsets_with_right_border(index); + filter.process_one_pixel_with_horizontal_borders( + src_rows.at(vertical_index, index), + dst_rows.at(vertical_index, index), vertical_offsets, + horizontal_offsets); + } + } +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_FILTERS_FILTER_2D_H diff --git a/kleidicv/include/kleidicv/filters/filter_2d_5x5_base.h b/kleidicv/include/kleidicv/filters/filter_2d_5x5_base.h new file mode 100644 index 000000000..8f4d92921 --- /dev/null +++ b/kleidicv/include/kleidicv/filters/filter_2d_5x5_base.h @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_FILTER_2D_5X5_BASE_H +#define KLEIDICV_FILTER_2D_5X5_BASE_H + +#include "kleidicv/workspace/border_5x5.h" + +namespace KLEIDICV_TARGET_NAMESPACE { +template +class Filter2D5x5Base { + protected: + using BorderInfoType = + typename KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderOffsets = typename BorderInfoType::Offsets; + + template + void load_window(KernelWindowFunctor& KernelWindow, + LoadArrayElementFunctionType load_array_element, + Rows src_rows, + BorderOffsets window_row_offsets, + BorderOffsets window_col_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + KernelWindow(0, 0) = load_array_element( + src_rows.at(window_row_offsets.c0(), window_col_offsets.c0())[index]); + KernelWindow(0, 1) = load_array_element( + src_rows.at(window_row_offsets.c0(), window_col_offsets.c1())[index]); + KernelWindow(0, 2) = load_array_element( + src_rows.at(window_row_offsets.c0(), window_col_offsets.c2())[index]); + KernelWindow(0, 3) = load_array_element( + src_rows.at(window_row_offsets.c0(), window_col_offsets.c3())[index]); + KernelWindow(0, 4) = load_array_element( + src_rows.at(window_row_offsets.c0(), window_col_offsets.c4())[index]); + KernelWindow(1, 0) = load_array_element( + src_rows.at(window_row_offsets.c1(), window_col_offsets.c0())[index]); + KernelWindow(1, 1) = load_array_element( + src_rows.at(window_row_offsets.c1(), window_col_offsets.c1())[index]); + KernelWindow(1, 2) = load_array_element( + src_rows.at(window_row_offsets.c1(), window_col_offsets.c2())[index]); + KernelWindow(1, 3) = load_array_element( + src_rows.at(window_row_offsets.c1(), window_col_offsets.c3())[index]); + KernelWindow(1, 4) = load_array_element( + src_rows.at(window_row_offsets.c1(), window_col_offsets.c4())[index]); + KernelWindow(2, 0) = load_array_element( + src_rows.at(window_row_offsets.c2(), window_col_offsets.c0())[index]); + KernelWindow(2, 1) = load_array_element( + src_rows.at(window_row_offsets.c2(), window_col_offsets.c1())[index]); + KernelWindow(2, 2) = load_array_element( + src_rows.at(window_row_offsets.c2(), window_col_offsets.c2())[index]); + KernelWindow(2, 3) = load_array_element( + src_rows.at(window_row_offsets.c2(), window_col_offsets.c3())[index]); + KernelWindow(2, 4) = load_array_element( + src_rows.at(window_row_offsets.c2(), window_col_offsets.c4())[index]); + KernelWindow(3, 0) = load_array_element( + src_rows.at(window_row_offsets.c3(), window_col_offsets.c0())[index]); + KernelWindow(3, 1) = load_array_element( + src_rows.at(window_row_offsets.c3(), window_col_offsets.c1())[index]); + KernelWindow(3, 2) = load_array_element( + src_rows.at(window_row_offsets.c3(), window_col_offsets.c2())[index]); + KernelWindow(3, 3) = load_array_element( + src_rows.at(window_row_offsets.c3(), window_col_offsets.c3())[index]); + KernelWindow(3, 4) = load_array_element( + src_rows.at(window_row_offsets.c3(), window_col_offsets.c4())[index]); + KernelWindow(4, 0) = load_array_element( + src_rows.at(window_row_offsets.c4(), window_col_offsets.c0())[index]); + KernelWindow(4, 1) = load_array_element( + src_rows.at(window_row_offsets.c4(), window_col_offsets.c1())[index]); + KernelWindow(4, 2) = load_array_element( + src_rows.at(window_row_offsets.c4(), window_col_offsets.c2())[index]); + KernelWindow(4, 3) = load_array_element( + src_rows.at(window_row_offsets.c4(), window_col_offsets.c3())[index]); + KernelWindow(4, 4) = load_array_element( + src_rows.at(window_row_offsets.c4(), window_col_offsets.c4())[index]); + } +}; +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_FILTER_2D_5X5_BASE_H diff --git a/kleidicv/include/kleidicv/filters/filter_2d_5x5_neon.h b/kleidicv/include/kleidicv/filters/filter_2d_5x5_neon.h new file mode 100644 index 000000000..b08bd114d --- /dev/null +++ b/kleidicv/include/kleidicv/filters/filter_2d_5x5_neon.h @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_FILTER_2D_5X5_NEON_H +#define KLEIDICV_FILTER_2D_5X5_NEON_H + +#include "filter_2d.h" +#include "filter_2d_5x5_base.h" +#include "kleidicv/neon.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for Filter2D 5x5. +template +class Filter2D + : public Filter2D5x5Base { + public: + using SourceType = typename InnerFilterType::SourceType; + using DestinationType = typename InnerFilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using Base = Filter2D5x5Base; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + static constexpr size_t kMargin = 2UL; + explicit Filter2D(InnerFilterType filter) : filter_{filter} {} + + void process_pixels_without_horizontal_borders( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, + BorderOffsets window_col_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[5][5]; + SourceVectorType dst_vec; + + auto KernelWindow = [&](size_t row, size_t col) -> SourceVectorType& { + return src[row][col]; + }; + + auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; + Base::load_window(KernelWindow, load_array_element, src_rows, + window_row_offsets, window_col_offsets, index); + filter_.vector_path(KernelWindow, dst_vec); + + vst1q(&dst_rows[index], dst_vec); + }); + + loop.tail([&](size_t index) { + process_one_element_with_horizontal_borders( + src_rows, dst_rows, window_row_offsets, window_col_offsets, index); + }); + } + + void process_one_pixel_with_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_one_element_with_horizontal_borders( + src_rows, dst_rows, window_row_offsets, window_col_offsets, index); + } + } + + private: + void process_one_element_with_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceType src[5][5]; + + auto KernelWindow = [&](size_t row, size_t col) + KLEIDICV_STREAMING_COMPATIBLE -> SourceType& { + return src[row][col]; + }; + + auto load_array_element = [&](const SourceType& x) + KLEIDICV_STREAMING_COMPATIBLE { return x; }; + + Base::load_window(KernelWindow, load_array_element, src_rows, + window_row_offsets, window_col_offsets, index); + + filter_.scalar_path(KernelWindow, dst_rows[index]); + } + + InnerFilterType filter_; +}; // end of class Filter2D + +// Shorthand for 5x5 2D filters driver type. +template +using Filter2D5x5 = Filter2D; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_FILTER_2D_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/filters/filter_2d_5x5_sc.h b/kleidicv/include/kleidicv/filters/filter_2d_5x5_sc.h new file mode 100644 index 000000000..847433a8e --- /dev/null +++ b/kleidicv/include/kleidicv/filters/filter_2d_5x5_sc.h @@ -0,0 +1,109 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_FILTER_2D_5X5_SC_H +#define KLEIDICV_FILTER_2D_5X5_SC_H + +#include "filter_2d.h" +#include "filter_2d_5x5_base.h" +#include "kleidicv/sve2.h" + +namespace KLEIDICV_TARGET_NAMESPACE { +// Template for Filter2D 5x5. +template +class Filter2D + : public Filter2D5x5Base { + public: + using SourceType = typename InnerFilterType::SourceType; + using DestinationType = typename InnerFilterType::DestinationType; + using SourceVecTraits = + typename KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BorderInfoType = + typename KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + using Base = Filter2D5x5Base; + static constexpr size_t kMargin = 2UL; + explicit Filter2D(InnerFilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + void process_pixels_without_horizontal_borders( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svptrue(); + process_elements_with_vector_operation(src_rows, dst_rows, + window_row_offsets, + window_col_offsets, index, pg); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + process_elements_with_vector_operation(src_rows, dst_rows, + window_row_offsets, + window_col_offsets, index, pg); + }); + } + + void process_one_pixel_with_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + process_elements_with_vector_operation( + src_rows, dst_rows, window_row_offsets, window_col_offsets, index, + SourceVecTraits::template svptrue_pat()); + } + } + + private: + void process_elements_with_vector_operation( + Rows src_rows, Rows dst_rows, + BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, + size_t index, svbool_t pg) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_1_0, + src_1_1, src_1_2, src_1_3, src_1_4, src_2_0, src_2_1, src_2_2, src_2_3, + src_2_4, src_3_0, src_3_1, src_3_2, src_3_3, src_3_4, src_4_0, src_4_1, + src_4_2, src_4_3, src_4_4, output_vector; + + // Initialization + ScalableVectorArray2D KernelWindow = {{ + {std::ref(src_0_0), std::ref(src_0_1), std::ref(src_0_2), + std::ref(src_0_3), std::ref(src_0_4)}, + {std::ref(src_1_0), std::ref(src_1_1), std::ref(src_1_2), + std::ref(src_1_3), std::ref(src_1_4)}, + {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2), + std::ref(src_2_3), std::ref(src_2_4)}, + {std::ref(src_3_0), std::ref(src_3_1), std::ref(src_3_2), + std::ref(src_3_3), std::ref(src_3_4)}, + {std::ref(src_4_0), std::ref(src_4_1), std::ref(src_4_2), + std::ref(src_4_3), std::ref(src_4_4)}, + }}; + + auto load_array_element = + [&](const SourceType& x) + KLEIDICV_STREAMING_COMPATIBLE { return svld1(pg, &x); }; + + Base::load_window(KernelWindow, load_array_element, src_rows, + window_row_offsets, window_col_offsets, index); + filter_.vector_path(KernelWindow, output_vector, pg); + svst1(pg, &dst_rows[index], output_vector); + } + + InnerFilterType filter_; +}; // end of class Filter2D + +// Shorthand for 5x5 2D filters driver type. +template +using Filter2D5x5 = Filter2D; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_FILTER_2D_5X5_SC_H diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index e9fd15058..32f0ebc37 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -2035,9 +2035,10 @@ kleidicv_error_t kleidicv_warp_perspective_u8( /// `width * sizeof(type) * channels`, except for /// single-row images. /// @param width Number of columns in the data. (One column consists of -/// `channels` number of elements.) Must be greater -/// than 4. -/// @param height Number of rows in the data. Must be greater than 4. +/// `channels` number of elements.) Must be greater than +/// or equal to `kernel_width - 1`. +/// @param height Number of rows in the data. Must be greater than or +/// equal to `kernel_height - 1`. /// @param channels Number of channels in the data. Must not be more than /// @ref KLEIDICV_MAXIMUM_CHANNEL_COUNT. /// @param kernel_width Width of the Median kernel. Must be 5 and equal to diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sve2.h index ebbc5d04e..33fd117a4 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -251,6 +251,30 @@ class VecTraitsBase : public VectorTypes { return svptrue_b64(); } + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b8(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b16(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b32(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b64(pat); + } + template static std::enable_if_t svwhilelt( IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { @@ -561,6 +585,17 @@ static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING_COMPATIBLE { b = tmp; } +// The following wrapper is used as a workaround to treat SVE variables as a 2D +// array. +template +class ScalableVectorArray2D { + public: + std::reference_wrapper window[Rows][Cols]; + VectorType &operator()(int row, int col) KLEIDICV_STREAMING_COMPATIBLE { + return window[row][col].get(); + } +}; + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_SVE2_H diff --git a/kleidicv/src/filters/median_blur_api.cpp b/kleidicv/src/filters/median_blur_api.cpp index 8263f7c95..a95007df7 100644 --- a/kleidicv/src/filters/median_blur_api.cpp +++ b/kleidicv/src/filters/median_blur_api.cpp @@ -19,11 +19,37 @@ kleidicv_error_t median_blur_stripe(const T *src, size_t src_stride, T *dst, } // namespace neon +namespace sve2 { + +template +kleidicv_error_t median_blur_stripe(const T *src, size_t src_stride, T *dst, + size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, + size_t channels, size_t kernel_width, + size_t kernel_height, + FixedBorderType border_type); + +} // namespace sve2 + +namespace sme2 { + +template +kleidicv_error_t median_blur_stripe(const T *src, size_t src_stride, T *dst, + size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, + size_t channels, size_t kernel_width, + size_t kernel_height, + FixedBorderType border_type); + +} // namespace sme2 + } // namespace kleidicv -#define KLEIDICV_DEFINE_C_API(name, type) \ - KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::median_blur_stripe, \ - nullptr, nullptr) +#define KLEIDICV_DEFINE_C_API(name, type) \ + KLEIDICV_MULTIVERSION_C_API( \ + name, &kleidicv::neon::median_blur_stripe, \ + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::median_blur_stripe), \ + &kleidicv::sme2::median_blur_stripe) KLEIDICV_DEFINE_C_API(kleidicv_median_blur_stripe_s8, int8_t); KLEIDICV_DEFINE_C_API(kleidicv_median_blur_stripe_u8, uint8_t); diff --git a/kleidicv/src/filters/median_blur_neon.cpp b/kleidicv/src/filters/median_blur_neon.cpp index 769873646..33b67012c 100644 --- a/kleidicv/src/filters/median_blur_neon.cpp +++ b/kleidicv/src/filters/median_blur_neon.cpp @@ -3,10 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/ctypes.h" +#include "kleidicv/filters/filter_2d.h" +#include "kleidicv/filters/filter_2d_5x5_neon.h" #include "kleidicv/filters/median_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" #include "kleidicv/workspace/border_5x5.h" +#include "median_blur_sorting_network_5x5.h" namespace kleidicv::neon { @@ -14,378 +17,77 @@ namespace kleidicv::neon { template class MedianBlur; -// Template for Median Blur 5x5 filters. template -class MedianBlur { - public: - using SourceType = ScalarType; - using DestinationType = SourceType; - using SourceVectorType = typename VecTraits::VectorType; - using DestinationVectorType = typename VecTraits::VectorType; - - void vector_path(SourceVectorType window[5][5], - DestinationVectorType& dst) const { - sorting_network5x5(window, dst); - } - - void scalar_path(SourceType window[5][5], DestinationType& dst) const { - sorting_network5x5(window, dst); - } - - private: +class MedianBlurBase { + protected: class vectorized_comparator { public: + using SourceVectorType = typename VecTraits::VectorType; + static void compare_and_swap(SourceVectorType& left, - SourceVectorType& right) { - SourceVectorType tmp_left = vmaxq(left, right); - SourceVectorType tmp_right = vminq(left, right); - left = tmp_left; - right = tmp_right; + SourceVectorType& right, Monostate&) { + SourceVectorType max_value = vmaxq(left, right); + SourceVectorType min_value = vminq(left, right); + left = min_value; + right = max_value; } - static void min(SourceVectorType& left, SourceVectorType& right) { - right = vminq(left, right); + static void min(SourceVectorType& left, SourceVectorType& right, + Monostate&) { + left = vminq(left, right); } - static void max(SourceVectorType& left, SourceVectorType& right) { - left = vmaxq(left, right); + static void max(SourceVectorType& left, SourceVectorType& right, + Monostate&) { + right = vmaxq(left, right); } }; - class scalar_comparator { public: - static void compare_and_swap(ScalarType& left, ScalarType& right) { - if (left < right) { + static void compare_and_swap(ScalarType& left, ScalarType& right, + Monostate&) { + if (left > right) { std::swap(left, right); } } - static void min(ScalarType& left, ScalarType& right) { - right = std::min(left, right); + static void min(ScalarType& left, ScalarType& right, Monostate&) { + left = std::min(left, right); } - static void max(ScalarType& left, ScalarType& right) { - left = std::max(left, right); + static void max(ScalarType& left, ScalarType& right, Monostate&) { + right = std::max(left, right); } }; +}; - // R. B. Kent and M. S. Pattichis, ''Design of high-speed multiway merge - // sorting networks using fast single-stage N-sorters and N-filters,'' *IEEE - // Access*, vol. 10, pp. 79565–79581, Jul. 2022, - // doi: 10.1109/ACCESS.2022.3193370. The paper is currently available at: - // https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9837930 - template - void sorting_network5x5(T window[5][5], T& dst) const { - // full sort col - // col 0 - Comparator::compare_and_swap(window[0][0], window[3][0]); - Comparator::compare_and_swap(window[1][0], window[4][0]); - Comparator::compare_and_swap(window[0][0], window[2][0]); - Comparator::compare_and_swap(window[1][0], window[3][0]); - Comparator::compare_and_swap(window[0][0], window[1][0]); - Comparator::compare_and_swap(window[2][0], window[4][0]); - Comparator::compare_and_swap(window[1][0], window[2][0]); - Comparator::compare_and_swap(window[3][0], window[4][0]); - Comparator::compare_and_swap(window[2][0], window[3][0]); - // col 1 - Comparator::compare_and_swap(window[0][1], window[3][1]); - Comparator::compare_and_swap(window[1][1], window[4][1]); - Comparator::compare_and_swap(window[0][1], window[2][1]); - Comparator::compare_and_swap(window[1][1], window[3][1]); - Comparator::compare_and_swap(window[0][1], window[1][1]); - Comparator::compare_and_swap(window[2][1], window[4][1]); - Comparator::compare_and_swap(window[1][1], window[2][1]); - Comparator::compare_and_swap(window[3][1], window[4][1]); - Comparator::compare_and_swap(window[2][1], window[3][1]); - // col 2 - Comparator::compare_and_swap(window[0][2], window[3][2]); - Comparator::compare_and_swap(window[1][2], window[4][2]); - Comparator::compare_and_swap(window[0][2], window[2][2]); - Comparator::compare_and_swap(window[1][2], window[3][2]); - Comparator::compare_and_swap(window[0][2], window[1][2]); - Comparator::compare_and_swap(window[2][2], window[4][2]); - Comparator::compare_and_swap(window[1][2], window[2][2]); - Comparator::compare_and_swap(window[3][2], window[4][2]); - Comparator::compare_and_swap(window[2][2], window[3][2]); - // col 3 - Comparator::compare_and_swap(window[0][3], window[3][3]); - Comparator::compare_and_swap(window[1][3], window[4][3]); - Comparator::compare_and_swap(window[0][3], window[2][3]); - Comparator::compare_and_swap(window[1][3], window[3][3]); - Comparator::compare_and_swap(window[0][3], window[1][3]); - Comparator::compare_and_swap(window[2][3], window[4][3]); - Comparator::compare_and_swap(window[1][3], window[2][3]); - Comparator::compare_and_swap(window[3][3], window[4][3]); - Comparator::compare_and_swap(window[2][3], window[3][3]); - // col 4 - Comparator::compare_and_swap(window[0][4], window[3][4]); - Comparator::compare_and_swap(window[1][4], window[4][4]); - Comparator::compare_and_swap(window[0][4], window[2][4]); - Comparator::compare_and_swap(window[1][4], window[3][4]); - Comparator::compare_and_swap(window[0][4], window[1][4]); - Comparator::compare_and_swap(window[2][4], window[4][4]); - Comparator::compare_and_swap(window[1][4], window[2][4]); - Comparator::compare_and_swap(window[3][4], window[4][4]); - Comparator::compare_and_swap(window[2][4], window[3][4]); - // partialy sort row - // sort row zero for only element 3 and 4 - Comparator::compare_and_swap(window[0][0], window[0][3]); - Comparator::compare_and_swap(window[0][1], window[0][4]); - Comparator::compare_and_swap(window[0][0], window[0][2]); - Comparator::compare_and_swap(window[0][1], window[0][3]); - Comparator::min(window[0][0], window[0][1]); - Comparator::compare_and_swap(window[0][2], window[0][4]); - Comparator::min(window[0][1], window[0][2]); - Comparator::compare_and_swap(window[0][3], window[0][4]); - Comparator::min(window[0][2], window[0][3]); - // sort row 1 for only element {2, 3, 4} - Comparator::compare_and_swap(window[1][0], window[1][3]); - Comparator::compare_and_swap(window[1][1], window[1][4]); - Comparator::compare_and_swap(window[1][0], window[1][2]); - Comparator::compare_and_swap(window[1][1], window[1][3]); - Comparator::min(window[1][0], window[1][1]); - Comparator::compare_and_swap(window[1][2], window[1][4]); - Comparator::min(window[1][1], window[1][2]); - Comparator::compare_and_swap(window[1][3], window[1][4]); - Comparator::compare_and_swap(window[1][2], window[1][3]); - // sort row 2 {1, 2, 3} - Comparator::compare_and_swap(window[2][0], window[2][3]); - Comparator::compare_and_swap(window[2][1], window[2][4]); - Comparator::compare_and_swap(window[2][0], window[2][2]); - Comparator::compare_and_swap(window[2][1], window[2][3]); - Comparator::min(window[2][0], window[2][1]); - Comparator::compare_and_swap(window[2][2], window[2][4]); - Comparator::compare_and_swap(window[2][1], window[2][2]); - Comparator::max(window[2][3], window[2][4]); - Comparator::compare_and_swap(window[2][2], window[2][3]); - // sort row 3 - Comparator::compare_and_swap(window[3][0], window[3][3]); - Comparator::compare_and_swap(window[3][1], window[3][4]); - Comparator::compare_and_swap(window[3][0], window[3][2]); - Comparator::compare_and_swap(window[3][1], window[3][3]); - Comparator::compare_and_swap(window[3][0], window[3][1]); - Comparator::compare_and_swap(window[3][2], window[3][4]); - Comparator::compare_and_swap(window[3][1], window[3][2]); - Comparator::max(window[3][3], window[3][4]); - Comparator::max(window[3][2], window[3][3]); - // sort row 4 - Comparator::compare_and_swap(window[4][0], window[4][3]); - Comparator::compare_and_swap(window[4][1], window[4][4]); - Comparator::compare_and_swap(window[4][0], window[4][2]); - Comparator::max(window[4][1], window[4][3]); - Comparator::compare_and_swap(window[4][0], window[4][1]); - Comparator::max(window[4][2], window[4][4]); - Comparator::max(window[4][1], window[4][2]); - // partialy sort digonal - // sort dig 0 - Comparator::min(window[0][3], window[2][1]); - Comparator::min(window[1][2], window[3][0]); - Comparator::min(window[2][1], window[3][0]); - // sort dig 1 - Comparator::compare_and_swap(window[0][4], window[3][1]); - Comparator::compare_and_swap(window[1][3], window[4][0]); - Comparator::compare_and_swap(window[0][4], window[2][2]); - Comparator::compare_and_swap(window[1][3], window[3][1]); - Comparator::min(window[0][4], window[1][3]); - Comparator::compare_and_swap(window[2][2], window[4][0]); - Comparator::min(window[1][3], window[2][2]); - Comparator::max(window[3][1], window[4][0]); - Comparator::max(window[2][2], window[3][1]); - // sort dig 2 - Comparator::max(window[1][4], window[3][2]); - Comparator::max(window[2][3], window[4][1]); - Comparator::max(window[1][4], window[2][3]); - // find median - Comparator::compare_and_swap(window[1][4], window[3][0]); - Comparator::min(window[1][4], window[2][2]); - Comparator::max(window[2][2], window[3][0]); - dst = window[2][2]; - } -}; // end of class MedianBlur - -// Primary Template for Filter 2D. -template -class Filter2D; - -// Template for Filter2D 5x5. -template -class Filter2D { +// Template for Median Blur 5x5 filters. +template +class MedianBlur : public MedianBlurBase { public: - using SourceType = typename FilterType::SourceType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - static constexpr size_t margin = 2UL; - explicit Filter2D(FilterType filter) : filter_{filter} {} - - void process_pixels_without_horizontal_borders( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[5][5]; - SourceVectorType dst_vec; - - auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; - load_window(src, load_array_element, src_rows, window_row_offsets, - window_col_offsets, index); - filter_.vector_path(src, dst_vec); - - vst1q(&dst_rows[index], dst_vec); - }); - - loop.tail([&](size_t index) { - process_one_element_with_or_without_horizontal_borders( - src_rows, dst_rows, window_row_offsets, window_col_offsets, index); - }); - } - - void process_one_pixel_with_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_one_element_with_or_without_horizontal_borders( - src_rows, dst_rows, window_row_offsets, window_col_offsets, index); - } - } - - private: - template - void load_window(T src[5][5], LoadArrayElementFunctionType load_array_element, - Rows src_rows, - BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets, size_t index) const { - src[0][0] = load_array_element( - src_rows.at(window_row_offsets.c0(), window_col_offsets.c0())[index]); - src[0][1] = load_array_element( - src_rows.at(window_row_offsets.c0(), window_col_offsets.c1())[index]); - src[0][2] = load_array_element( - src_rows.at(window_row_offsets.c0(), window_col_offsets.c2())[index]); - src[0][3] = load_array_element( - src_rows.at(window_row_offsets.c0(), window_col_offsets.c3())[index]); - src[0][4] = load_array_element( - src_rows.at(window_row_offsets.c0(), window_col_offsets.c4())[index]); - src[1][0] = load_array_element( - src_rows.at(window_row_offsets.c1(), window_col_offsets.c0())[index]); - src[1][1] = load_array_element( - src_rows.at(window_row_offsets.c1(), window_col_offsets.c1())[index]); - src[1][2] = load_array_element( - src_rows.at(window_row_offsets.c1(), window_col_offsets.c2())[index]); - src[1][3] = load_array_element( - src_rows.at(window_row_offsets.c1(), window_col_offsets.c3())[index]); - src[1][4] = load_array_element( - src_rows.at(window_row_offsets.c1(), window_col_offsets.c4())[index]); - src[2][0] = load_array_element( - src_rows.at(window_row_offsets.c2(), window_col_offsets.c0())[index]); - src[2][1] = load_array_element( - src_rows.at(window_row_offsets.c2(), window_col_offsets.c1())[index]); - src[2][2] = load_array_element( - src_rows.at(window_row_offsets.c2(), window_col_offsets.c2())[index]); - src[2][3] = load_array_element( - src_rows.at(window_row_offsets.c2(), window_col_offsets.c3())[index]); - src[2][4] = load_array_element( - src_rows.at(window_row_offsets.c2(), window_col_offsets.c4())[index]); - src[3][0] = load_array_element( - src_rows.at(window_row_offsets.c3(), window_col_offsets.c0())[index]); - src[3][1] = load_array_element( - src_rows.at(window_row_offsets.c3(), window_col_offsets.c1())[index]); - src[3][2] = load_array_element( - src_rows.at(window_row_offsets.c3(), window_col_offsets.c2())[index]); - src[3][3] = load_array_element( - src_rows.at(window_row_offsets.c3(), window_col_offsets.c3())[index]); - src[3][4] = load_array_element( - src_rows.at(window_row_offsets.c3(), window_col_offsets.c4())[index]); - src[4][0] = load_array_element( - src_rows.at(window_row_offsets.c4(), window_col_offsets.c0())[index]); - src[4][1] = load_array_element( - src_rows.at(window_row_offsets.c4(), window_col_offsets.c1())[index]); - src[4][2] = load_array_element( - src_rows.at(window_row_offsets.c4(), window_col_offsets.c2())[index]); - src[4][3] = load_array_element( - src_rows.at(window_row_offsets.c4(), window_col_offsets.c3())[index]); - src[4][4] = load_array_element( - src_rows.at(window_row_offsets.c4(), window_col_offsets.c4())[index]); - } - - void process_one_element_with_or_without_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index) const { - SourceType src[5][5]; - - auto load_array_element = [](const SourceType& x) { return x; }; - load_window(src, load_array_element, src_rows, window_row_offsets, - window_col_offsets, index); - - filter_.scalar_path(src, dst_rows[index]); + using SourceType = ScalarType; + using DestinationType = SourceType; + using SourceVectorType = typename VecTraits::VectorType; + using DestinationVectorType = typename VecTraits::VectorType; + using VectorComparator = + typename MedianBlurBase::vectorized_comparator; + using ScalarComparator = + typename MedianBlurBase::scalar_comparator; + + template + void vector_path(KernelWindowFunctor& KernelWindow, + DestinationVectorType& output_vec) const { + Monostate ctx; + sorting_network5x5(KernelWindow, output_vec, ctx); } - FilterType filter_; -}; // end of class Filter2D - -template -void process_filter2d(Rectangle rect, size_t y_begin, size_t y_end, - Rows src_rows, - Rows dst_rows, - typename FilterType::BorderType border_type, - FilterType filter) { - // Border helper which calculates border offsets. - typename FilterType::BorderInfoType vertical_border{rect.height(), - border_type}; - typename FilterType::BorderInfoType horizontal_border{rect.width(), - border_type}; - - for (size_t vertical_index = y_begin; vertical_index < y_end; - ++vertical_index) { - auto vertical_offsets = vertical_border.offsets_with_border(vertical_index); - constexpr size_t margin = filter.margin; - - // Process data affected by left border. - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t horizontal_index = 0; horizontal_index < margin; - ++horizontal_index) { - auto horizontal_offsets = - horizontal_border.offsets_with_left_border(horizontal_index); - filter.process_one_pixel_with_horizontal_borders( - src_rows.at(vertical_index, horizontal_index), - dst_rows.at(vertical_index, horizontal_index), vertical_offsets, - horizontal_offsets); - } - // Process data which is not affected by any borders in bulk. - size_t width_without_borders = rect.width() - (2 * margin); - auto horizontal_offsets = horizontal_border.offsets_without_border(); - filter.process_pixels_without_horizontal_borders( - width_without_borders, src_rows.at(vertical_index, margin), - dst_rows.at(vertical_index, margin), vertical_offsets, - horizontal_offsets); - - // Process data affected by right border. - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t horizontal_index = 0; horizontal_index < margin; - ++horizontal_index) { - size_t index = rect.width() - margin + horizontal_index; - auto horizontal_offsets = - horizontal_border.offsets_with_right_border(index); - filter.process_one_pixel_with_horizontal_borders( - src_rows.at(vertical_index, index), - dst_rows.at(vertical_index, index), vertical_offsets, - horizontal_offsets); - } + template + void scalar_path(KernelWindowFunctor& KernelWindow, + DestinationType& dst) const { + Monostate ctx; + sorting_network5x5(KernelWindow, dst, ctx); } -} - -// Shorthand for 5x5 2D filters driver type. -template -using Filter2D5x5 = Filter2D; +}; // end of class MedianBlur template kleidicv_error_t median_blur_stripe(const T* src, size_t src_stride, T* dst, diff --git a/kleidicv/src/filters/median_blur_sc.h b/kleidicv/src/filters/median_blur_sc.h new file mode 100644 index 000000000..dbb9bc2a5 --- /dev/null +++ b/kleidicv/src/filters/median_blur_sc.h @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MEDIAN_BLUR_SC_H +#define KLEIDICV_MEDIAN_BLUR_SC_H + +#include +#include + +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/filter_2d.h" +#include "kleidicv/filters/filter_2d_5x5_sc.h" +#include "kleidicv/filters/median_blur.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_5x5.h" +#include "median_blur_sorting_network_5x5.h" +namespace KLEIDICV_TARGET_NAMESPACE { + +// Primary template for Median Blur filters. +template +class MedianBlur; + +template +class MedianBlurBase { + protected: + class vectorized_comparator { + public: + using SourceVectorType = typename VecTraits::VectorType; + + static void compare_and_swap(SourceVectorType& left, + SourceVectorType& right, + svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType max_value = svmax_m(pg, left, right); + SourceVectorType min_value = svmin_m(pg, left, right); + left = min_value; + right = max_value; + } + + static void min(SourceVectorType& left, SourceVectorType& right, + svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + left = svmin_m(pg, left, right); + } + + static void max(SourceVectorType& left, SourceVectorType& right, + svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + right = svmax_m(pg, left, right); + } + }; +}; + +// Template for Median Blur 5x5 filters. +template +class MedianBlur : public MedianBlurBase { + public: + using SourceType = ScalarType; + using DestinationType = SourceType; + using SourceVecTraits = + typename KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using DestinationVectorType = typename KLEIDICV_TARGET_NAMESPACE::VecTraits< + DestinationType>::VectorType; + using VectorComparator = + typename MedianBlurBase::vectorized_comparator; + template + void vector_path(KernelWindowFunctor& KernelWindow, + DestinationVectorType& output_vec, + svbool_t& pg) const KLEIDICV_STREAMING_COMPATIBLE { + sorting_network5x5(KernelWindow, output_vec, pg); + } +}; // end of class MedianBlur + +template +kleidicv_error_t median_blur_stripe_sc( + const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, size_t channels, + [[maybe_unused]] size_t kernel_width, [[maybe_unused]] size_t kernel_height, + FixedBorderType border_type) KLEIDICV_STREAMING_COMPATIBLE { + Rectangle rect{width, height}; + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + MedianBlur median_filter; + Filter2D5x5> filter{median_filter}; + process_filter2d(rect, y_begin, y_end, src_rows, dst_rows, border_type, + filter); + return KLEIDICV_OK; +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_MEDIAN_BLUR_SC_H diff --git a/kleidicv/src/filters/median_blur_sme2.cpp b/kleidicv/src/filters/median_blur_sme2.cpp new file mode 100644 index 000000000..f946540b8 --- /dev/null +++ b/kleidicv/src/filters/median_blur_sme2.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/filters/median_blur.h" +#include "median_blur_sc.h" + +namespace kleidicv::sme2 { + +template +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +median_blur_stripe(const T* src, size_t src_stride, T* dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, + size_t channels, size_t kernel_width, size_t kernel_height, + FixedBorderType border_type) { + return median_blur_stripe_sc(src, src_stride, dst, dst_stride, width, height, + y_begin, y_end, channels, kernel_width, + kernel_height, border_type); +} + +#define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_stripe( \ + const type* src, size_t src_stride, type* dst, size_t dst_stride, \ + size_t width, size_t height, size_t y_begin, size_t y_end, \ + size_t channels, size_t kernel_width, size_t kernel_height, \ + FixedBorderType border_type) + +KLEIDICV_INSTANTIATE_TEMPLATE(int8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(float); + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/median_blur_sorting_network_5x5.h b/kleidicv/src/filters/median_blur_sorting_network_5x5.h new file mode 100644 index 000000000..9622d6c5a --- /dev/null +++ b/kleidicv/src/filters/median_blur_sorting_network_5x5.h @@ -0,0 +1,143 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MEDIAN_BLUR_SORTING_NETWORK_5X5_H +#define KLEIDICV_MEDIAN_BLUR_SORTING_NETWORK_5X5_H + +#include + +#include "kleidicv/kleidicv.h" +#include "kleidicv/traits.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// R. B. Kent and M. S. Pattichis, ''Design of high-speed multiway merge +// sorting networks using fast single-stage N-sorters and N-filters,'' *IEEE +// Access*, vol. 10, pp. 79565–79581, Jul. 2022, +// doi: 10.1109/ACCESS.2022.3193370. The paper is currently available at: +// https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9837930 +template +void sorting_network5x5(KernelWindowFunctor& KernelWindow, T& output_vec, + ContextType& context) KLEIDICV_STREAMING_COMPATIBLE { + Comparator::compare_and_swap(KernelWindow(3, 0), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(2, 0), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 0), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(1, 0), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(2, 0), context); + Comparator::compare_and_swap(KernelWindow(2, 0), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(3, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 0), KernelWindow(2, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(0, 1), context); + Comparator::compare_and_swap(KernelWindow(4, 1), KernelWindow(1, 1), context); + Comparator::compare_and_swap(KernelWindow(2, 1), KernelWindow(0, 1), context); + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(1, 1), context); + Comparator::compare_and_swap(KernelWindow(1, 1), KernelWindow(0, 1), context); + Comparator::compare_and_swap(KernelWindow(4, 1), KernelWindow(2, 1), context); + Comparator::compare_and_swap(KernelWindow(2, 1), KernelWindow(1, 1), context); + Comparator::compare_and_swap(KernelWindow(4, 1), KernelWindow(3, 1), context); + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(2, 1), context); + Comparator::compare_and_swap(KernelWindow(3, 2), KernelWindow(0, 2), context); + Comparator::compare_and_swap(KernelWindow(4, 2), KernelWindow(1, 2), context); + Comparator::compare_and_swap(KernelWindow(2, 2), KernelWindow(0, 2), context); + Comparator::compare_and_swap(KernelWindow(3, 2), KernelWindow(1, 2), context); + Comparator::compare_and_swap(KernelWindow(1, 2), KernelWindow(0, 2), context); + Comparator::compare_and_swap(KernelWindow(4, 2), KernelWindow(2, 2), context); + Comparator::compare_and_swap(KernelWindow(2, 2), KernelWindow(1, 2), context); + Comparator::compare_and_swap(KernelWindow(4, 2), KernelWindow(3, 2), context); + Comparator::compare_and_swap(KernelWindow(3, 2), KernelWindow(2, 2), context); + Comparator::compare_and_swap(KernelWindow(3, 3), KernelWindow(0, 3), context); + Comparator::compare_and_swap(KernelWindow(4, 3), KernelWindow(1, 3), context); + Comparator::compare_and_swap(KernelWindow(2, 3), KernelWindow(0, 3), context); + Comparator::compare_and_swap(KernelWindow(3, 3), KernelWindow(1, 3), context); + Comparator::compare_and_swap(KernelWindow(1, 3), KernelWindow(0, 3), context); + Comparator::compare_and_swap(KernelWindow(4, 3), KernelWindow(2, 3), context); + Comparator::compare_and_swap(KernelWindow(2, 3), KernelWindow(1, 3), context); + Comparator::compare_and_swap(KernelWindow(4, 3), KernelWindow(3, 3), context); + Comparator::compare_and_swap(KernelWindow(3, 3), KernelWindow(2, 3), context); + Comparator::compare_and_swap(KernelWindow(3, 4), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(4, 4), KernelWindow(1, 4), context); + Comparator::compare_and_swap(KernelWindow(2, 4), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(3, 4), KernelWindow(1, 4), context); + Comparator::compare_and_swap(KernelWindow(1, 4), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(4, 4), KernelWindow(2, 4), context); + Comparator::compare_and_swap(KernelWindow(2, 4), KernelWindow(1, 4), context); + Comparator::compare_and_swap(KernelWindow(4, 4), KernelWindow(3, 4), context); + Comparator::compare_and_swap(KernelWindow(3, 4), KernelWindow(2, 4), context); + // sort row zero for only element 3 and 4 + Comparator::compare_and_swap(KernelWindow(0, 3), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(0, 4), KernelWindow(0, 1), context); + Comparator::compare_and_swap(KernelWindow(0, 2), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(0, 3), KernelWindow(0, 1), context); + Comparator::min(KernelWindow(0, 1), KernelWindow(0, 0), context); + Comparator::compare_and_swap(KernelWindow(0, 4), KernelWindow(0, 2), context); + Comparator::min(KernelWindow(0, 2), KernelWindow(0, 1), context); + Comparator::compare_and_swap(KernelWindow(0, 4), KernelWindow(0, 3), context); + Comparator::min(KernelWindow(0, 3), KernelWindow(0, 2), context); + // sort row 1 for only element {2, 3, 4} + Comparator::compare_and_swap(KernelWindow(1, 3), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(1, 4), KernelWindow(1, 1), context); + Comparator::compare_and_swap(KernelWindow(1, 2), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(1, 3), KernelWindow(1, 1), context); + Comparator::min(KernelWindow(1, 1), KernelWindow(1, 0), context); + Comparator::compare_and_swap(KernelWindow(1, 4), KernelWindow(1, 2), context); + Comparator::min(KernelWindow(1, 2), KernelWindow(1, 1), context); + Comparator::compare_and_swap(KernelWindow(1, 4), KernelWindow(1, 3), context); + Comparator::compare_and_swap(KernelWindow(1, 3), KernelWindow(1, 2), context); + // sort row 2 {1, 2, 3} + Comparator::compare_and_swap(KernelWindow(2, 3), KernelWindow(2, 0), context); + Comparator::compare_and_swap(KernelWindow(2, 4), KernelWindow(2, 1), context); + Comparator::compare_and_swap(KernelWindow(2, 2), KernelWindow(2, 0), context); + Comparator::compare_and_swap(KernelWindow(2, 3), KernelWindow(2, 1), context); + Comparator::min(KernelWindow(2, 1), KernelWindow(2, 0), context); + Comparator::compare_and_swap(KernelWindow(2, 4), KernelWindow(2, 2), context); + Comparator::compare_and_swap(KernelWindow(2, 2), KernelWindow(2, 1), context); + Comparator::max(KernelWindow(2, 4), KernelWindow(2, 3), context); + Comparator::compare_and_swap(KernelWindow(2, 3), KernelWindow(2, 2), context); + // sort row 3 + Comparator::compare_and_swap(KernelWindow(3, 3), KernelWindow(3, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 4), KernelWindow(3, 1), context); + Comparator::compare_and_swap(KernelWindow(3, 2), KernelWindow(3, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 3), KernelWindow(3, 1), context); + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(3, 0), context); + Comparator::compare_and_swap(KernelWindow(3, 4), KernelWindow(3, 2), context); + Comparator::compare_and_swap(KernelWindow(3, 2), KernelWindow(3, 1), context); + Comparator::max(KernelWindow(3, 4), KernelWindow(3, 3), context); + Comparator::max(KernelWindow(3, 3), KernelWindow(3, 2), context); + // sort row 4 + Comparator::compare_and_swap(KernelWindow(4, 3), KernelWindow(4, 0), context); + Comparator::compare_and_swap(KernelWindow(4, 4), KernelWindow(4, 1), context); + Comparator::compare_and_swap(KernelWindow(4, 2), KernelWindow(4, 0), context); + Comparator::max(KernelWindow(4, 3), KernelWindow(4, 1), context); + Comparator::compare_and_swap(KernelWindow(4, 1), KernelWindow(4, 0), context); + Comparator::max(KernelWindow(4, 4), KernelWindow(4, 2), context); + Comparator::max(KernelWindow(4, 2), KernelWindow(4, 1), context); + // sort dig 0 + Comparator::min(KernelWindow(2, 1), KernelWindow(0, 3), context); + Comparator::min(KernelWindow(3, 0), KernelWindow(1, 2), context); + Comparator::min(KernelWindow(3, 0), KernelWindow(2, 1), context); + // sort dig 1 + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(1, 3), context); + Comparator::compare_and_swap(KernelWindow(2, 2), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(3, 1), KernelWindow(1, 3), context); + Comparator::min(KernelWindow(1, 3), KernelWindow(0, 4), context); + Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(2, 2), context); + Comparator::min(KernelWindow(2, 2), KernelWindow(1, 3), context); + Comparator::max(KernelWindow(4, 0), KernelWindow(3, 1), context); + Comparator::max(KernelWindow(3, 1), KernelWindow(2, 2), context); + // sort dig 2 + Comparator::max(KernelWindow(3, 2), KernelWindow(1, 4), context); + Comparator::max(KernelWindow(4, 1), KernelWindow(2, 3), context); + Comparator::max(KernelWindow(2, 3), KernelWindow(1, 4), context); + Comparator::compare_and_swap(KernelWindow(3, 0), KernelWindow(1, 4), context); + Comparator::min(KernelWindow(2, 2), KernelWindow(1, 4), context); + Comparator::max(KernelWindow(3, 0), KernelWindow(2, 2), context); + output_vec = KernelWindow(2, 2); +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_MEDIAN_BLUR_SORTING_NETWORK_5X5_H diff --git a/kleidicv/src/filters/median_blur_sve2.cpp b/kleidicv/src/filters/median_blur_sve2.cpp new file mode 100644 index 000000000..4e40ece94 --- /dev/null +++ b/kleidicv/src/filters/median_blur_sve2.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/filters/median_blur.h" +#include "median_blur_sc.h" + +namespace kleidicv::sve2 { + +template +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_stripe( + const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { + return median_blur_stripe_sc(src, src_stride, dst, dst_stride, width, height, + y_begin, y_end, channels, kernel_width, + kernel_height, border_type); +} + +#define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_stripe( \ + const type* src, size_t src_stride, type* dst, size_t dst_stride, \ + size_t width, size_t height, size_t y_begin, size_t y_end, \ + size_t channels, size_t kernel_width, size_t kernel_height, \ + FixedBorderType border_type) + +KLEIDICV_INSTANTIATE_TEMPLATE(int8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(float); + +} // namespace kleidicv::sve2 -- GitLab