From 1d67199e85e3fcef70218c112350f60b75b228e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 26 Jul 2024 17:10:26 +0200 Subject: [PATCH] Support arbitrary kernel sizes in Separable Filter 2D --- .../include/kleidicv/filter_driver_neon.h | 118 +++++++++++ kleidicv/include/kleidicv/workspace/border.h | 185 ++++++++++++++++++ .../include/kleidicv/workspace/separable.h | 79 +++++++- .../src/filters/separable_filter_2d_neon.cpp | 179 ++++++++++++++--- test/api/test_separable_filter_2d.cpp | 42 ++++ 5 files changed, 579 insertions(+), 24 deletions(-) diff --git a/kleidicv/include/kleidicv/filter_driver_neon.h b/kleidicv/include/kleidicv/filter_driver_neon.h index daacde7af..dcf960223 100644 --- a/kleidicv/include/kleidicv/filter_driver_neon.h +++ b/kleidicv/include/kleidicv/filter_driver_neon.h @@ -12,6 +12,13 @@ namespace kleidicv::neon { // Template for drivers of separable NxM filters. +template +class SeparableFilterDriver; + +// Template for drivers of separable NxM filters with arbitrary kernel sizes. +template +class SeparableFilterDriverArbitrary; + template class SeparableFilterDriver { public: @@ -135,6 +142,117 @@ class SeparableFilterDriver { FilterType filter_; }; // end of class SeparableFilterDriver +template +class SeparableFilterDriverArbitrary { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = typename neon::DynamicBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilterDriverArbitrary(FilterType &filter, + size_t kernel_size) + : margin(kernel_size >> 1), filter_{filter}, kernel_size_(kernel_size) {} + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(SourceVectorType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = vld1q(&src_rows.at(border_offsets.c(i))[index]); + } + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(SourceType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = src_rows.at(border_offsets.c(i))[index]; + } + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + BufferVectorType *src_a = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + BufferVectorType *src_b = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + + for (size_t i = 0; i < kernel_size_; i++) { + src_a[i] = vld1q(&src_rows.at(0, border_offsets.c(i))[index]); + } + + for (size_t i = 0; i < kernel_size_; i++) { + src_b[i] = vld1q(&src_rows.at( + 0, border_offsets.c(i))[index + BufferVecTraits::num_lanes()]); + } + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = vld1q(&src_rows.at(0, border_offsets.c(i))[index]); + } + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + const size_t margin; + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets, + size_t index) const { + BufferType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = src_rows.at(0, border_offsets.c(i))[index]; + } + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; + const size_t kernel_size_; +}; // end of class SeparableFilterDriverArbitrary + } // namespace kleidicv::neon #endif // KLEIDICV_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/workspace/border.h b/kleidicv/include/kleidicv/workspace/border.h index 3e5a8d34c..ddc149b0a 100644 --- a/kleidicv/include/kleidicv/workspace/border.h +++ b/kleidicv/include/kleidicv/workspace/border.h @@ -14,6 +14,10 @@ namespace KLEIDICV_TARGET_NAMESPACE { template class FixedBorderInfo; +// Border offsets for dynamically-sized filters. +template +class DynamicBorderInfo; + template class FixedBorderInfo final { public: @@ -202,6 +206,187 @@ class FixedBorderInfo final { FixedBorderType border_type_; }; // end of class FixedBorderInfo +template +class DynamicBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + Offsets() = delete; + Offsets(const Offsets&) = delete; + Offsets& operator=(const Offsets&) = delete; + Offsets(Offsets&& other) noexcept : offsets_(other.offsets_) { + other.offsets_ = nullptr; + } + + explicit Offsets(size_t kernel_size) : offsets_(new size_t[kernel_size]) {} + ~Offsets() { delete[] offsets_; } + + size_t c(int i) const { return offsets_ ? offsets_[i] : 0; } + + private: + friend class DynamicBorderInfo; + size_t* offsets_; + }; + + DynamicBorderInfo(size_t height, FixedBorderType border_type, + size_t kernel_size) + : height_(height), + border_type_(border_type), + kernel_size_(kernel_size), + half_kernel_size_(static_cast(kernel_size >> 1)) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get_no_border(); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index); + break; + + case FixedBorderType::WRAP: + return get_border(column_index); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{kernel_size_}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + column_index = height_ - column_index - 1; + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index); + break; + + case FixedBorderType::WRAP: + return get_border(column_index); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{kernel_size_}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index < static_cast(half_kernel_size_)) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= + (height_ - static_cast(half_kernel_size_))) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Creates the Offsets object containing offsets in the interval + // [-(KernelSize / 2), KernelSize / 2]. + inline Offsets get_no_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets{kernel_size_}; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + return offsets; + } + + // Creates the Offsets object containing offsets in various intervals + // depending on the column, border type as well the border position used. + // + // For examples, refer to the static implementation in class FixedBorderInfo. + // NOLINTBEGIN(readability-function-cognitive-complexity) + template + inline Offsets get_border(int column) const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets{kernel_size_}; + + if constexpr (!IsRight) { + for (int i = 0; i < half_kernel_size_; i++) { + if (i - half_kernel_size_ < -column) { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + offsets.offsets_[i] = -column; + } else if constexpr (BorderType == FixedBorderType::REFLECT) { + offsets.offsets_[i] = half_kernel_size_ - (column << 1) - (i + 1); + } else if constexpr (BorderType == FixedBorderType::WRAP) { + offsets.offsets_[i] = i - half_kernel_size_ + height_; + } else if constexpr (BorderType == FixedBorderType::REVERSE) { + offsets.offsets_[i] = half_kernel_size_ - (column << 1) - i; + } + continue; + } + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + } + + if constexpr (IsRight) { + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + if (i >= column) { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + offsets.offsets_[half_kernel_size_ + 1 + i] = column; + } else if constexpr (BorderType == FixedBorderType::REFLECT) { + offsets.offsets_[half_kernel_size_ + 1 + i] = (column << 1) - i; + } else if constexpr (BorderType == FixedBorderType::WRAP) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i - height_ + 1; + } else if constexpr (BorderType == FixedBorderType::REVERSE) { + offsets.offsets_[half_kernel_size_ + 1 + i] = + (column << 1) - (i + 1); + } + continue; + } + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + } + + return offsets; + } + // NOLINTEND(readability-function-cognitive-complexity) + + size_t height_; + FixedBorderType border_type_; + + size_t kernel_size_; + int half_kernel_size_; +}; // end of class DynamicBorderInfo + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_BORDER_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 7a341d51e..27a394d75 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -16,6 +16,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Forward declarations. class SeparableFilterWorkspace; +class SeparableFilterWorkspaceDynamic; // Deleter for SeparableFilterWorkspace instances. class SeparableFilterWorkspaceDeleter { @@ -67,7 +68,7 @@ class SeparableFilterWorkspaceDeleter { // // Handling of borders is calculated based on offsets rather than setting up // suitably-sized buffers which could hold both borders and data. -class SeparableFilterWorkspace final { +class SeparableFilterWorkspace { public: // To avoid load/store penalties. static constexpr size_t kAlignment = 16UL; @@ -195,6 +196,7 @@ class SeparableFilterWorkspace final { } } + protected: // Offset in bytes to the buffer rows from &data_[0]. size_t buffer_rows_offset_; // Stride of the buffer rows. @@ -208,6 +210,81 @@ class SeparableFilterWorkspace final { uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); }; // end of class SeparableFilterWorkspace +class SeparableFilterWorkspaceDynamic : SeparableFilterWorkspace { + public: + // Processes rows vertically first along the full width + template + void process(Rectangle rect, + Rows src_rows, + Rows dst_rows, + size_t channels, typename FilterType::BorderType border_type, + size_t kernel_size, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + // Border helper which calculates border offsets. + typename FilterType::BorderInfoType vertical_border{ + rect.height(), border_type, kernel_size}; + typename FilterType::BorderInfoType horizontal_border{ + rect.width(), border_type, kernel_size}; + + // Buffer rows which hold intermediate widened data. + auto buffer_rows = Rows{reinterpret_cast( + &data_[buffer_rows_offset_]), + buffer_rows_stride_, channels}; + + // Vertical processing loop. + for (size_t vertical_index = 0; vertical_index < rect.height(); + ++vertical_index) { + // Recalculate vertical border offsets. + auto offsets = vertical_border.offsets_with_border(vertical_index); + // Process in the vertical direction first. + filter.process_vertical(rect.width(), src_rows.at(vertical_index), + buffer_rows, offsets); + // Process in the horizontal direction last. + process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), + filter, horizontal_border); + } + } + + template + void process_horizontal(size_t width, + Rows buffer_rows, + Rows dst_rows, + FilterType filter, + typename FilterType::BorderInfoType horizontal_border) + KLEIDICV_STREAMING_COMPATIBLE { + // Margin associated with the filter. + size_t margin = filter.margin; + + // Process data affected by left border. + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + auto offsets = + horizontal_border.offsets_with_left_border(horizontal_index); + filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), + dst_rows.at(0, horizontal_index), + offsets); + } + + // Process data which is not affected by any borders in bulk. + { + size_t width_without_borders = width - (2 * margin); + auto offsets = horizontal_border.offsets_without_border(); + filter.process_horizontal(width_without_borders, + buffer_rows.at(0, margin), + dst_rows.at(0, margin), offsets); + } + + // Process data affected by right border. + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + size_t index = width - margin + horizontal_index; + auto offsets = horizontal_border.offsets_with_right_border(index); + filter.process_horizontal_borders(buffer_rows.at(0, index), + dst_rows.at(0, index), offsets); + } + } +}; // end of class SeparableFilterWorkspaceDynamic + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_SEPARABLE_H diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index be40208aa..9f2a66123 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -15,8 +15,11 @@ namespace kleidicv::neon { template class SeparableFilter2D; -template <> -class SeparableFilter2D { +template +class SeparableFilter2DArbitrary; + +template +class SeparableFilter2D { public: using SourceType = uint8_t; using BufferType = uint8_t; @@ -25,25 +28,27 @@ class SeparableFilter2D { explicit SeparableFilter2D(const uint8_t *kernel_x, const uint8_t *kernel_y) : kernel_x_(kernel_x), kernel_y_(kernel_y) {} - void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const { + void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { this->vector_path_with_kernel(src, dst, kernel_y_); } - void vertical_scalar_path(const SourceType src[5], BufferType *dst) const { + void vertical_scalar_path(const SourceType src[KernelSize], + BufferType *dst) const { this->scalar_path_with_kernel(src, dst, kernel_y_); } - void horizontal_vector_path(uint8x16_t src[5], DestinationType *dst) const { + void horizontal_vector_path(uint8x16_t src[KernelSize], + DestinationType *dst) const { this->vector_path_with_kernel(src, dst, kernel_x_); } - void horizontal_scalar_path(const BufferType src[5], + void horizontal_scalar_path(const BufferType src[KernelSize], DestinationType *dst) const { this->scalar_path_with_kernel(src, dst, kernel_x_); } private: - void vector_path_with_kernel(uint8x16_t src[5], uint8_t *dst, + void vector_path_with_kernel(uint8x16_t src[KernelSize], uint8_t *dst, const uint8_t *kernel) const { uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); @@ -53,7 +58,7 @@ class SeparableFilter2D { // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 1; i < 5; i++) { + for (size_t i = 1; i < KernelSize; i++) { uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); @@ -67,7 +72,7 @@ class SeparableFilter2D { vst1q_u8(&dst[0], result); } - void scalar_path_with_kernel(const uint8_t src[5], uint8_t *dst, + void scalar_path_with_kernel(const uint8_t src[KernelSize], uint8_t *dst, const uint8_t *kernel) const { uint8_t acc; // NOLINT if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { @@ -75,7 +80,7 @@ class SeparableFilter2D { return; } - for (size_t i = 1; i < 5; i++) { + for (size_t i = 1; i < KernelSize; i++) { uint8_t temp; // NOLINT if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { dst[0] = std::numeric_limits::max(); @@ -92,7 +97,143 @@ class SeparableFilter2D { const uint8_t *kernel_x_; const uint8_t *kernel_y_; -}; +}; // end of class SeparableFilter2D + +template <> +class SeparableFilter2DArbitrary { + public: + using SourceType = uint8_t; + using BufferType = uint8_t; + using DestinationType = uint8_t; + + explicit SeparableFilter2DArbitrary(const uint8_t *kernel_x, + const uint8_t *kernel_y, + const size_t kernel_size) + : kernel_x_(kernel_x), kernel_y_(kernel_y), kernel_size_(kernel_size) {} + + void vertical_vector_path(uint8x16_t src[], BufferType *dst) const { + this->vector_path_with_kernel(src, dst, kernel_y_); + } + + void vertical_scalar_path(const SourceType src[], BufferType *dst) const { + this->scalar_path_with_kernel(src, dst, kernel_y_); + } + + void horizontal_vector_path(uint8x16_t src[], DestinationType *dst) const { + this->vector_path_with_kernel(src, dst, kernel_x_); + } + + void horizontal_scalar_path(const BufferType src[], + DestinationType *dst) const { + this->scalar_path_with_kernel(src, dst, kernel_x_); + } + + private: + void vector_path_with_kernel(uint8x16_t src[], uint8_t *dst, + const uint8_t *kernel) const { + uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); + uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); + + acc_l = vmulq_n_u16(acc_l, kernel[0]); + acc_h = vmulq_n_u16(acc_h, kernel[0]); + + for (size_t i = 1; i < kernel_size_; i++) { + uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); + uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); + + acc_l = vmlaq_n_u16(acc_l, vec_l, kernel[i]); + acc_h = vmlaq_n_u16(acc_h, vec_h, kernel[i]); + } + + uint8x8_t result_l = vqmovn_u16(acc_l); + uint8x16_t result = vqmovn_high_u16(result_l, acc_h); + + vst1q_u8(&dst[0], result); + } + + void scalar_path_with_kernel(const uint8_t src[], uint8_t *dst, + const uint8_t *kernel) const { + uint8_t acc; // NOLINT + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { + dst[0] = std::numeric_limits::max(); + return; + } + + for (size_t i = 1; i < kernel_size_; i++) { + uint8_t temp; // NOLINT + if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { + dst[0] = std::numeric_limits::max(); + return; + } + if (__builtin_add_overflow(acc, temp, &acc)) { + dst[0] = std::numeric_limits::max(); + return; + } + } + + dst[0] = acc; + } + + const uint8_t *kernel_x_; + const uint8_t *kernel_y_; + const size_t kernel_size_; +}; // end of class SeparableFilter2DArbitrary + +template +static kleidicv_error_t separable_filter_2d_fixed_kernel_size( + Rectangle &rect, Rows &src_rows, + Rows &dst_rows, const ScalarType *kernel_x, + const ScalarType *kernel_y, size_t channels, FixedBorderType border_type, + SeparableFilterWorkspace *workspace) { + using SeparableFilterClass = SeparableFilter2D; + + SeparableFilterClass filterClass{kernel_x, kernel_y}; + SeparableFilterDriver filter{filterClass}; + workspace->process(rect, src_rows, dst_rows, channels, border_type, filter); + return KLEIDICV_OK; +} + +template +static kleidicv_error_t separable_filter_2d( + const ScalarType *src, size_t src_stride, ScalarType *dst, + size_t dst_stride, Rectangle &rect, size_t channels, + const ScalarType *kernel_x, const ScalarType *kernel_y, size_t kernel_size, + FixedBorderType border_type, SeparableFilterWorkspace *workspace) { + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + switch (kernel_size) { + case 3: + return separable_filter_2d_fixed_kernel_size<3>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 5: + return separable_filter_2d_fixed_kernel_size<5>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 7: + return separable_filter_2d_fixed_kernel_size<7>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 15: + return separable_filter_2d_fixed_kernel_size<15>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + default: + break; + } + + using SeparableFilterClass = SeparableFilter2DArbitrary; + + SeparableFilterClass filterClass{kernel_x, kernel_y, kernel_size}; + SeparableFilterDriverArbitrary filter{filterClass, + kernel_size}; + reinterpret_cast(workspace)->process( + rect, src_rows, dst_rows, channels, border_type, kernel_size, filter); + + return KLEIDICV_OK; +} KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t separable_filter_2d_u8( @@ -104,7 +245,7 @@ kleidicv_error_t separable_filter_2d_u8( auto *workspace = reinterpret_cast(context); auto fixed_border_type = get_fixed_border_type(border_type); - if (kernel_width != 5 || kernel_height != 5) { + if (kernel_width != kernel_height) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } @@ -136,17 +277,9 @@ kleidicv_error_t separable_filter_2d_u8( return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - using SeparableFilterClass = SeparableFilter2D; - - SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilterDriver filter{filterClass}; - - Rows src_rows{src, src_stride, channels}; - Rows dst_rows{dst, dst_stride, channels}; - workspace->process(rect, src_rows, dst_rows, channels, *fixed_border_type, - filter); - - return KLEIDICV_OK; + return separable_filter_2d(src, src_stride, dst, dst_stride, rect, channels, + kernel_x, kernel_y, kernel_width, + *fixed_border_type, workspace); } } // namespace kleidicv::neon diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp index 4d440b08c..7ffbfa9df 100644 --- a/test/api/test_separable_filter_2d.cpp +++ b/test/api/test_separable_filter_2d.cpp @@ -193,6 +193,48 @@ TYPED_TEST(SeparableFilter2D, 5x5Overflow) { EXPECT_EQ_ARRAY2D(dst_expected, dst); } +TYPED_TEST(SeparableFilter2D, Arbitrary9x9) { + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 9, 9, 9, 9)); + test::Array2D src{9, 9, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 1, 2, 3, 4, 5, 6, 7, 8, 9}); + src.set(1, 0, { 2, 3, 4, 5, 6, 7, 8, 9, 1}); + src.set(2, 0, { 3, 4, 5, 6, 7, 8, 9, 1, 2}); + src.set(3, 0, { 4, 5, 6, 7, 8, 9, 1, 2, 3}); + src.set(4, 0, { 5, 6, 7, 8, 9, 1, 2, 3, 4}); + src.set(5, 0, { 6, 7, 8, 9, 1, 2, 3, 4, 5}); + src.set(6, 0, { 7, 8, 9, 1, 2, 3, 4, 5, 6}); + src.set(7, 0, { 8, 9, 1, 2, 3, 4, 5, 6, 7}); + src.set(8, 0, { 9, 1, 2, 3, 4, 5, 6, 7, 8}); + // clang-format on + + test::Array2D kernel{9, 1}; + kernel.set(0, 0, {1, 0, 1, 0, 1, 0, 1, 0, 1}); + + test::Array2D dst{9, 9, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 9, 9, 1, kernel.data(), 9, kernel.data(), 9, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + + test::Array2D dst_expected{9, 9, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 85, 91, 106, 108, 128, 130, 150, 147, 162}); + dst_expected.set(1, 0, { 91, 106, 103, 123, 116, 136, 129, 144, 132}); + dst_expected.set(2, 0, { 106, 103, 118, 111, 131, 124, 144, 132, 147}); + dst_expected.set(3, 0, { 108, 123, 111, 131, 115, 135, 119, 134, 122}); + dst_expected.set(4, 0, { 128, 116, 131, 115, 135, 119, 139, 127, 142}); + dst_expected.set(5, 0, { 130, 136, 124, 135, 119, 130, 114, 129, 117}); + dst_expected.set(6, 0, { 150, 129, 144, 119, 139, 114, 134, 122, 137}); + dst_expected.set(7, 0, { 147, 144, 132, 134, 127, 129, 122, 137, 134}); + dst_expected.set(8, 0, { 162, 132, 147, 122, 142, 117, 137, 134, 149}); + // clang-format on + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + TYPED_TEST(SeparableFilter2D, NullPointer) { using KernelTestParams = SeparableFilter2DKernelTestParams; kleidicv_filter_context_t *context = nullptr; -- GitLab