diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt index 070c9caa9e5224843d9ef7ce81f0d0baad41d56f..7d5a532406682c56dca90179f298281485dd0aed 100644 --- a/kleidicv/CMakeLists.txt +++ b/kleidicv/CMakeLists.txt @@ -68,6 +68,7 @@ option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates bit exact result compared to OpenCV's original implementation" ON) # Marked experimental while CI does not test SME2 version of saturating add. option(KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 "Internal - Enable SME2 version of saturating add" OFF) +option(KLEIDICV_ENABLE_MOPA_CONVOLUTION "If turned ON gaussian blur will use MOPA convolution algorithm") # Continuous load and store NEON instructions produce suboptimal code generation on GCC version <= 11, # and these instructions are not supported on GCC version <=8. diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in index 4d71bf393a25d4d20e9a46ab8210da99ff3d713c..313fa76398b2d680be6137aed74c417eead7c38e 100644 --- a/kleidicv/include/kleidicv/config.h.in +++ b/kleidicv/include/kleidicv/config.h.in @@ -29,6 +29,8 @@ #cmakedefine01 KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 +#cmakedefine01 KLEIDICV_ENABLE_MOPA_CONVOLUTION + // Set to '1' if compiling NEON code paths, otherwise it is set to '0'. #ifndef KLEIDICV_TARGET_NEON #define KLEIDICV_TARGET_NEON 0 @@ -70,6 +72,10 @@ #define KLEIDICV_ASSUME_128BIT_SVE2 0 #define KLEIDICV_LOCALLY_STREAMING __arm_locally_streaming #define KLEIDICV_STREAMING __arm_streaming +#define KLEIDICV_STREAMING_COMPATIBLE __arm_streaming_compatible +#define KLEIDICV_INOUT_ZA __arm_inout("za") +#define KLEIDICV_NEW_ZA __arm_new("za") +#define KLEIDICV_PRESERVES_ZA __arm_preserves("za") #if KLEIDICV_TARGET_SME #define KLEIDICV_TARGET_FN_ATTRS KLEIDICV_ATTR_SECTION(".text.sme") @@ -115,4 +121,6 @@ #define KLEIDICV_NODISCARD #endif +#define KLEIDICV_RESTRICT __restrict__ + #endif // KLEIDICV_CONFIG_H diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 5c0bbc2c48eb5b21a57079755d495dda7b066c27..8d288bd9cb9c91c3eeeed7cd44b37c48a641e3f1 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -146,6 +146,16 @@ kleidicv_error_t gaussian_blur_fixed_stripe_u8( } // namespace sme +namespace sme2 { + +kleidicv_error_t gaussian_blur_fixed_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType border_type, kleidicv_filter_context_t *context); + +} // namespace sme2 + } // namespace kleidicv #endif // KLEIDICV_FILTERS_GAUSSIAN_BLUR_H diff --git a/kleidicv/include/kleidicv/filters/matmul.h b/kleidicv/include/kleidicv/filters/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..e8983c262021e4653eb8398543b36e93eb4c173e --- /dev/null +++ b/kleidicv/include/kleidicv/filters/matmul.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MATMUL_FILTER_GENERAL_H +#define KLEIDICV_MATMUL_FILTER_GENERAL_H + +#include +#include + +#include +#include +#include + +#include "kleidicv/config.h" +#include "kleidicv/types.h" +#include 
"kleidicv/workspace/border_15x15.h" +#include "kleidicv/workspace/border_types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Filter class for matmul approach +template +class MatmulFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using BorderInfoType = typename FilterType::BorderInfoType; + using BorderType = typename FilterType::BorderType; + + static constexpr size_t kKernelSize = FilterType::kKernelSize; + static constexpr size_t kBorderSize = FilterType::kBorderSize; + static constexpr size_t kChannels = Channels; + + explicit MatmulFilter(FilterType filter) : filter_(filter) {} + + // Process rows horizontally and vertically + KLEIDICV_NEW_ZA void process( + Rows src_rows, + Rows transposed_buffer_rows, + Rows dst_rows, Rectangle rect, + Rectangle padded_rect, + typename FilterType::BorderInfoType horizontal_border, + typename FilterType::BorderInfoType vertical_border) KLEIDICV_STREAMING { + vertical_process(src_rows, dst_rows, rect, padded_rect, horizontal_border); + horizontal_process(dst_rows, transposed_buffer_rows, dst_rows, rect, + padded_rect, vertical_border); + } + + private: + void horizontal_process( + Rows src_rows, + Rows transpose_buffer_rows, + Rows dst_rows, Rectangle rect, + Rectangle padded_rect, typename FilterType::BorderInfoType border_info) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + typename FilterType::template IterationsInfo iterations_info; + const size_t col_iteration_step = iterations_info.horizontal_col_step(); + const size_t row_iteration_step = iterations_info.horizontal_row_step(); + + const size_t kernel_block_size = iterations_info.kernel_block_size(); + const size_t border_size = FilterType::kBorderSize; + const ssize_t kernel_block_border_padding = + rect.width() + border_size - kernel_block_size; + const size_t borderless_end = + kernel_block_border_padding < 0 ? 0 : kernel_block_border_padding; + + for (size_t row = 0; row < rect.height(); row += row_iteration_step) { + size_t batch = row_iteration_step; + // Regular branch instead of ternary operator + // to avoid csel. Relying on branch predictor + // since branch is predictable. 
+ if (batch > padded_rect.height() - row) { // NOLINT + batch = padded_rect.height() - row; + } + + transposer_.transpose(src_rows, transpose_buffer_rows, rect, row, batch); + + for (size_t col = 0; col < border_size; col += col_iteration_step) { + filter_.template horizontal_path( + transpose_buffer_rows, dst_rows, rect, col, row, border_info); + } + + for (size_t col = border_size; col < borderless_end; + col += col_iteration_step) { + filter_.template horizontal_path( + transpose_buffer_rows, dst_rows, rect, col, row, border_info); + } + + for (size_t col = borderless_end; col < rect.width(); + col += col_iteration_step) { + filter_.template horizontal_path( + transpose_buffer_rows, dst_rows, rect, col, row, border_info); + } + } + } + + void vertical_process(Rows src_rows, + Rows dst_rows, + Rectangle rect, Rectangle padded_rect, + BorderInfoType border_info) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + typename FilterType::template IterationsInfo iterations_info; + const size_t col_iteration_step = iterations_info.vertical_col_step(); + const size_t row_iteration_step = iterations_info.vertical_row_step(); + const size_t elements_width = rect.width() * Channels; + + const size_t kernel_block_size = iterations_info.kernel_block_size(); + const size_t border_size = FilterType::kBorderSize; + const ssize_t kernel_block_border_padding = + rect.height() + border_size - kernel_block_size; + const size_t borderless_end = + kernel_block_border_padding < 0 ? 0 : kernel_block_border_padding; + + for (size_t col = 0; col < elements_width; col += col_iteration_step) { + for (size_t row = 0; row < border_size; row += row_iteration_step) { + filter_.template vertical_path( + src_rows, dst_rows, rect, padded_rect, col, row, border_info); + } + for (size_t row = border_size; row < borderless_end; + row += row_iteration_step) { + filter_.template vertical_path( + src_rows, dst_rows, rect, padded_rect, col, row, border_info); + } + for (size_t row = borderless_end; row < rect.height(); + row += row_iteration_step) { + filter_.template vertical_path( + src_rows, dst_rows, rect, padded_rect, col, row, border_info); + } + } + } + + FilterType filter_; + TransposerType transposer_; +}; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif diff --git a/kleidicv/include/kleidicv/filters/matmul_filter_checks.h b/kleidicv/include/kleidicv/filters/matmul_filter_checks.h new file mode 100644 index 0000000000000000000000000000000000000000..3d854882bedf6da3ee378a6edec4762046b6cad1 --- /dev/null +++ b/kleidicv/include/kleidicv/filters/matmul_filter_checks.h @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_MATMUL_FILTER_CHECKS_H +#define KLEIDICV_MATMUL_FILTER_CHECKS_H + +#include + +#include "kleidicv/config.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { +#if KLEIDICV_ENABLE_MOPA_CONVOLUTION +static constexpr size_t kMinKernelSize = 7; + +inline bool gaussian_blur_sme2_implementation_checks(size_t kernel_width, + size_t kernel_height, + size_t channels) { + return (kernel_height >= kMinKernelSize) && + (kernel_width >= kMinKernelSize) && (channels != 2); +} + +inline bool gaussian_blur_sme2_implementation_checks(size_t kernel_width, + size_t kernel_height) { + return (kernel_height >= kMinKernelSize) && (kernel_width >= kMinKernelSize); +} + +#else +inline bool gaussian_blur_sme2_implementation_checks(size_t, size_t, size_t) { + return false; +} + +inline bool 
gaussian_blur_sme2_implementation_checks(size_t, size_t) { + return false; +} +#endif +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif diff --git a/kleidicv/include/kleidicv/filters/sigma.h b/kleidicv/include/kleidicv/filters/sigma.h index 686c7197191feca8d440be33f78009c4c6b27b1e..46dff6cc4f8960109f670c05de3298467373aa74 100644 --- a/kleidicv/include/kleidicv/filters/sigma.h +++ b/kleidicv/include/kleidicv/filters/sigma.h @@ -5,9 +5,12 @@ #ifndef KLEIDICV_SIGMA_H #define KLEIDICV_SIGMA_H +#include +#include #include #include #include +#include #include "kleidicv/config.h" @@ -71,6 +74,26 @@ static void generate_gaussian_half_kernel(uint16_t* half_kernel, half_kernel[kMid] = static_cast(std::round(multiplier - error)); } +template +static std::array generate_gaussian_float_kernel( + float sigma) { + constexpr size_t half_kernel_size = KernelSize / 2 + 1; + std::array half_kernel{}; + generate_gaussian_half_kernel(half_kernel.data(), half_kernel_size, sigma); + + std::array kernel{}; + uint16_t max_value = std::numeric_limits::max(); + for (size_t i = 0; i <= KernelSize / 2; i++) { + kernel[i] = std::min(max_value, half_kernel[i]); + } + for (size_t i = KernelSize / 2 + 1; i < KernelSize; i++) { + kernel[i] = + std::min(max_value, half_kernel[KernelSize / 2 - (i - KernelSize / 2)]); + } + + return kernel; +} + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_SIGMA_H diff --git a/kleidicv/include/kleidicv/workspace/border.h b/kleidicv/include/kleidicv/workspace/border.h new file mode 100644 index 0000000000000000000000000000000000000000..727e0c1dcd815def160daa48c0703ab92a830736 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border.h @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_H +#define KLEIDICV_WORKSPACE_BORDER_H + +#include + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class BorderInfo { + public: + BorderInfo(size_t height, kleidicv::FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + size_t translate_index(ssize_t index) { + if (index < 0) { + switch (border_type_) { + case kleidicv::FixedBorderType::REPLICATE: + return 0; + case kleidicv::FixedBorderType::REFLECT: + return -index - 1; + case kleidicv::FixedBorderType::WRAP: + return height_ + index; + case kleidicv::FixedBorderType::REVERSE: + return -index; + } + } else if (static_cast(index) >= height_) { + switch (border_type_) { + case kleidicv::FixedBorderType::REPLICATE: + return height_ - 1; + case kleidicv::FixedBorderType::REFLECT: + return 2 * height_ - index - 1; + case kleidicv::FixedBorderType::WRAP: + return index - height_; + case kleidicv::FixedBorderType::REVERSE: + return 2 * height_ - index - 2; + } + } else { + return index; + } + } + + private: + static constexpr size_t kMargin = KernelSize / 2; + + size_t height_; + kleidicv::FixedBorderType border_type_; +}; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif diff --git a/kleidicv/include/kleidicv/workspace/matmul.h b/kleidicv/include/kleidicv/workspace/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..42c98e7e1e0228f2023f6691dbc82aa1792e97a4 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/matmul.h @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_MATMUL_H +#define 
KLEIDICV_WORKSPACE_MATMUL_H + +#include + +#include +#include +#include + +#include "kleidicv/types.h" +#include "kleidicv/utils.h" +#include "separable.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +class MatmulBufferSizesPolicy : public DefaultBufferSizesPolicy { + public: + MatmulBufferSizesPolicy(const Rectangle &rect, const Rectangle &kernel_rect, + size_t channels) + : DefaultBufferSizesPolicy(rect, channels), kernel_rect_(kernel_rect) {} + + size_t compute_buffer_size() const { + size_t max_border_width = kernel_rect_.width() >> 1; + Rectangle padded_rect(rect_.width() + (max_border_width << 1), + rect_.height() + (max_border_width << 1)); + + size_t matmul_allocation_size = (compute_helper_buffer_size(padded_rect) + + compute_kernel_buffer_size()); + return std::max(matmul_allocation_size, + DefaultBufferSizesPolicy::compute_buffer_size()); + } + + size_t compute_transpose_buffer_cols() const KLEIDICV_STREAMING_COMPATIBLE { + return svcntsb(); + } + + size_t compute_transpose_buffer_rows(Rectangle padded_rect, size_t channels, + size_t max_kernel_width) const + KLEIDICV_STREAMING_COMPATIBLE { + size_t rows = padded_rect.width() * channels; + + // To avoid using predicates and reminder loops while processing + // the buffer it should be aligned by SVLB (while building the buffer) + // and 4 (since it's being processed with 4 rows iteration because of the + // UMOPA) + size_t align1 = svcntsb(); + size_t align2 = max_kernel_width + svcntsw() - 1; + size_t lcm = (align2 >> __builtin_ctz(align2)) << __builtin_ctz(align1); + rows = align_up(rows, lcm); + return rows; + } + + size_t compute_helper_buffer_size(Rectangle padded_rect) const + KLEIDICV_STREAMING_COMPATIBLE { + return compute_transpose_buffer_cols() * + compute_transpose_buffer_rows(padded_rect, channels_, + kernel_rect_.width()); + } + + size_t compute_kernel_buffer_size() const KLEIDICV_STREAMING_COMPATIBLE { + // Same logic here with alignment: since UMOPA processing + // 4 rows at a time, alignment to 4 is needed + size_t rows = svcntsw() + kernel_rect_.width() - 1; + rows = align_up(rows, static_cast(4)); + + size_t cols = svcntsw(); + return cols * rows; + } + + private: + const Rectangle kernel_rect_; +}; + +// Workspace for separable fixed-size filters that uses matmul approach. +// +// Theory of operation remains the same as in the separable.h +// +// Operation will be done with 2 matrix multiplications: one corresponds to +// vertical path, second to horizontal path. First matrix is src_rows, second +// matrix is Toeplitz matrix built from kernel vectors data. This matrix is +// being constructed in the following way: +// +// M[i][j] = k_{i - j}, if i \in [j, j + K] +// M[i][j] = 0, otherwise +// +// where K stands for kernel size and k_i are kernel vector elements. In other +// words, kernel matrix is zero matrix with kernel values being placed on the +// diagonal strip with size K. 
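// A worked instance of the construction above (exposition only): for
// K = 3 with kernel k = [k0, k1, k2] and a 5-column block, M is
// (5 + K - 1) x 5 = 7 x 5:
//
//   M = [ k0  0   0   0   0  ]
//       [ k1  k0  0   0   0  ]
//       [ k2  k1  k0  0   0  ]
//       [ 0   k2  k1  k0  0  ]
//       [ 0   0   k2  k1  k0 ]
//       [ 0   0   0   k2  k1 ]
//       [ 0   0   0   0   k2 ]
//
// so a row of 7 border-padded source samples multiplied by M yields the
// 5 filtered outputs of that row in a single matrix product.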
+// +// Consequently, in matrix multiplication expression the following happens: +// +// K_v^T * M * K_h +// +class MatmulSeparableFilterWorkspace final : public SeparableFilterWorkspace { + public: + // Processes rows horizontally and vertically + template + void process(Rectangle rect, size_t y_begin, size_t y_end, + Rows src_rows, + Rows dst_rows, + typename FilterType::BorderType border_type, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + constexpr size_t kChannels = FilterType::kChannels; + constexpr size_t kKernelSize = FilterType::kKernelSize; + constexpr size_t kBorderSize = FilterType::kBorderSize; + static_assert(kChannels == 1 || kChannels == 3 || kChannels == 4); + + src_rows = src_rows.at(y_begin); + size_t height = y_end - y_begin; + + Rectangle corrected_rect{rect.width(), height}; + Rectangle padded_rect(rect.width() + (kBorderSize << 1), + rect.height() + (kBorderSize << 1)); + + MatmulBufferSizesPolicy matmul_policy( + corrected_rect, Rectangle(kKernelSize, kKernelSize), kChannels); + + typename FilterType::BorderInfoType vertical_border{corrected_rect.width(), + border_type}; + typename FilterType::BorderInfoType horizontal_border{ + corrected_rect.height(), border_type}; + + size_t kernel_buffer_size = matmul_policy.compute_kernel_buffer_size(); + auto *helper_buffer = reinterpret_cast( + &data_[buffer_rows_offset_ + kernel_buffer_size]); + helper_buffer = align_up(helper_buffer, kAlignment); + + // Channels is 1 due to the fact that data is transposed + auto transposed_buffer_rows = + Rows{helper_buffer, matmul_policy.compute_transpose_buffer_cols() * + sizeof(typename FilterType::SourceType)}; + filter.process(src_rows, transposed_buffer_rows, dst_rows, rect, + padded_rect, horizontal_border, vertical_border); + } + + // Get allocated chunk of memory for helper kernel buffer + void *get_kernel_buffer() { return &data_[buffer_rows_offset_]; } +}; // end of class SeparableFilterWorkspace + +static_assert(sizeof(MatmulSeparableFilterWorkspace) == + sizeof(SeparableFilterWorkspace)); + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_SEPARABLE_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index a9cb2f7aa55aa33eeb3d251214a443cc6a238200..9b45407e742ffc9339781cbbd116aca9c14de6d9 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -25,6 +25,30 @@ class SeparableFilterWorkspaceDeleter { }; }; +class DefaultBufferSizesPolicy { + public: + DefaultBufferSizesPolicy(const Rectangle &rect, size_t channels) + : rect_(rect), channels_(channels) {} + + size_t compute_buffer_rows_stride() const { + size_t buffer_rows_number_of_elements = rect_.width() * channels_; + // Adding more elements because of SVE, where interleaving stores are + // governed by one predicate. For example, if a predicate requires 7 uint8_t + // elements and an algorithm performs widening to 16 bits, the resulting + // interleaving store will still be governed by the same predicate, thus + // storing 8 elements. Choosing '3' to account for svst4(). + buffer_rows_number_of_elements += 3; + + return buffer_rows_number_of_elements; + } + + size_t compute_buffer_size() const { return compute_buffer_rows_stride(); } + + protected: + const Rectangle rect_; + const size_t channels_; +}; + // Workspace for separable fixed-size filters. 
// // Theory of operation @@ -76,47 +100,36 @@ class SeparableFilterWorkspace { using Pointer = std::unique_ptr; - // Workspace is only constructible with create(). - SeparableFilterWorkspace() = delete; - // Creates a workspace on the heap. - static Pointer create(Rectangle rect, size_t channels, - size_t intermediate_size) KLEIDICV_STREAMING { - size_t buffer_rows_number_of_elements = rect.width() * channels; - // Adding more elements because of SVE, where interleaving stores are - // governed by one predicate. For example, if a predicate requires 7 uint8_t - // elements and an algorithm performs widening to 16 bits, the resulting - // interleaving store will still be governed by the same predicate, thus - // storing 8 elements. Choosing '3' to account for svst4(). - buffer_rows_number_of_elements += 3; - - size_t buffer_rows_stride = - buffer_rows_number_of_elements * intermediate_size; - size_t buffer_rows_size = buffer_rows_stride; - buffer_rows_size += kAlignment - 1; + template + static Pointer create( + Rectangle rect, size_t channels, size_t intermediate_size, + const BufferAllocationPolicy &policy) KLEIDICV_STREAMING { + size_t buffer_rows_size = + (policy.compute_buffer_size() + kAlignment - 1) * intermediate_size; // Try to allocate workspace at once. size_t allocation_size = sizeof(SeparableFilterWorkspace) + buffer_rows_size; void *allocation = std::malloc(allocation_size); - auto workspace = SeparableFilterWorkspace::Pointer{ - reinterpret_cast(allocation)}; - - if (!workspace) { - return workspace; + if (!allocation) { + return SeparableFilterWorkspace::Pointer{ + reinterpret_cast(allocation)}; } - auto *buffer_rows_address = &workspace->data_[0]; - buffer_rows_address = align_up(buffer_rows_address, kAlignment); - workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0]; - workspace->buffer_rows_stride_ = buffer_rows_stride; - workspace->image_size_ = rect; - workspace->channels_ = channels; - workspace->intermediate_size_ = intermediate_size; + auto workspace = SeparableFilterWorkspace::Pointer{ + new (allocation) SeparableFilterWorkspace(rect, channels, + intermediate_size, policy)}; return workspace; } + static Pointer create(Rectangle rect, size_t channels, + size_t intermediate_size) { + return create(rect, channels, intermediate_size, + DefaultBufferSizesPolicy{rect, channels}); + } + size_t channels() const { return channels_; } Rectangle image_size() const { return image_size_; } size_t intermediate_size() const { return intermediate_size_; } @@ -197,6 +210,19 @@ class SeparableFilterWorkspace { } protected: + // Workspace is only constructible with create(). 
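  // Illustrative call of the policy-based create() overload (a sketch only;
  // it uses the declarations above plus MatmulBufferSizesPolicy from
  // workspace/matmul.h, and the intermediate element size is merely an
  // example):
  //
  //   auto workspace = SeparableFilterWorkspace::create(
  //       Rectangle{width, height}, channels, sizeof(uint16_t),
  //       MatmulBufferSizesPolicy{Rectangle{width, height},
  //                               Rectangle{kernel_size, kernel_size},
  //                               channels});
  //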
+ SeparableFilterWorkspace(const Rectangle &rect, size_t channels, + size_t intermediate_size, + const DefaultBufferSizesPolicy &policy) + : buffer_rows_stride_(policy.compute_buffer_rows_stride()), + image_size_(rect), + channels_(channels), + intermediate_size_(intermediate_size) { + auto *buffer_rows_address = &data_[0]; + buffer_rows_address = align_up(buffer_rows_address, kAlignment); + buffer_rows_offset_ = buffer_rows_address - &data_[0]; + } + template void process_horizontal(size_t width, Rows buffer_rows, diff --git a/kleidicv/include/kleidicv/workspace/workspace_factory.h b/kleidicv/include/kleidicv/workspace/workspace_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..af4e1eab63ad0d698ba70006824ded72ed0392ed --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/workspace_factory.h @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_WORKSPACE_FACTORY_H +#define KLEIDICV_WORKSPACE_WORKSPACE_FACTORY_H + +#include "kleidicv/dispatch.h" + +namespace kleidicv { + +namespace neon { +void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t max_kernel_width, + size_t max_kernel_height, size_t max_channels, size_t intermediate_size); + +void release_separable_filter_workspace(void *workspace); +} // namespace neon + +namespace sve2 { +void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t max_kernel_width, + size_t max_kernel_height, size_t max_channels, size_t intermediate_size); +void release_separable_filter_workspace(void *workspace); + +} // namespace sve2 + +namespace sme { +void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t max_kernel_width, + size_t max_kernel_height, size_t max_channels, size_t intermediate_size); +void release_separable_filter_workspace(void *workspace); + +} // namespace sme + +namespace sme2 { +void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t max_kernel_width, + size_t max_kernel_height, size_t max_channels, size_t intermediate_size); +void release_separable_filter_workspace(void *workspace); + +} // namespace sme2 + +} // namespace kleidicv + +#endif diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index 300d7d8ad6c8bddb0fe58eb813fcc64c01a67e17..2f3149ff86006e8bc95088f651172bc2b308dd6c 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -10,7 +10,8 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_gaussian_blur_fixed_stripe_u8, &kleidicv::neon::gaussian_blur_fixed_stripe_u8, KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_fixed_stripe_u8), - &kleidicv::sme::gaussian_blur_fixed_stripe_u8, nullptr); + &kleidicv::sme::gaussian_blur_fixed_stripe_u8, + &kleidicv::sme2::gaussian_blur_fixed_stripe_u8); KLEIDICV_MULTIVERSION_C_API(kleidicv_gaussian_blur_arbitrary_stripe_u8, &kleidicv::neon::gaussian_blur_arbitrary_stripe_u8, diff --git a/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp b/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e435368c1a53c1cc1629613c8e1d222a39bd5a28 --- /dev/null +++ b/kleidicv/src/filters/gaussian_blur_fixed_sme2.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + 
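// This translation unit provides the SME2 entry point wired up in
// gaussian_blur_api.cpp via KLEIDICV_MULTIVERSION_C_API: a locally
// streaming wrapper that forwards to the streaming-compatible
// implementation in gaussian_blur_fixed_sme2.h.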
+#include "gaussian_blur_fixed_sme2.h" + +#include "kleidicv/filters/gaussian_blur.h" + +namespace kleidicv::sme2 { + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +gaussian_blur_fixed_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, size_t width, + size_t height, size_t y_begin, size_t y_end, + size_t channels, size_t kernel_width, + size_t kernel_height, float sigma_x, + float sigma_y, FixedBorderType border_type, + kleidicv_filter_context_t *context) { + return gaussian_blur_fixed_stripe_u8_sme2( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, channels, + kernel_width, kernel_height, sigma_x, sigma_y, border_type, context); +} + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_fixed_sme2.h b/kleidicv/src/filters/gaussian_blur_fixed_sme2.h new file mode 100644 index 0000000000000000000000000000000000000000..ac44d4524746b016913f28c48793e4b58d454338 --- /dev/null +++ b/kleidicv/src/filters/gaussian_blur_fixed_sme2.h @@ -0,0 +1,609 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef GAUSSIAN_BLUR_SME_H +#define GAUSSIAN_BLUR_SME_H + +#include +#include + +#include +#include +#include +#include + +#include "gaussian_blur_fixed_sc.h" +#include "kleidicv/config.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/matmul.h" +#include "kleidicv/filters/matmul_filter_checks.h" +#include "kleidicv/filters/sigma.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/types.h" +#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_types.h" +#include "kleidicv/workspace/matmul.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class GaussianBlurMatmul { + public: + using SourceType = uint8_t; + using BufferType = uint8_t; + using DestinationType = uint8_t; + using BorderInfoType = BorderInfo; + using BorderType = FixedBorderType; + + static constexpr size_t kKernelSize = KernelSize; + static constexpr size_t kBorderSize = KernelSize / 2; + static constexpr size_t kKernelIterationStep = 4; + + // Class with iterations steps info for filter class + template + class IterationsInfo { + public: + IterationsInfo() KLEIDICV_STREAMING_COMPATIBLE = default; + size_t horizontal_col_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntw(); + } + + size_t horizontal_row_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + // Data layout for 1 channel differs on transposition step + if constexpr (Channels == 4 || Channels == 3) { + return svcntw(); + } else if constexpr (Channels == 1) { + return svcntb(); + } else { + static_assert(false); + } + } + + size_t vertical_row_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntw(); + } + + size_t vertical_col_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntb(); + } + + ptrdiff_t kernel_block_size() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return GaussianBlurMatmul::kernel_block_size(); + } + + size_t kernel_iteration_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return kKernelIterationStep; + } + }; + + explicit GaussianBlurMatmul(float sigma, uint8_t *kernel_buffer) + : kernel(generate_gaussian_float_kernel(sigma)) { + build_kernel_helper_buffer(kernel_buffer); + kernel_rows = Rows{kernel_buffer, svcntsw()}; + } + + // Apply gaussian kernel's Toeplitz matrix horizontally and get + // [row_start, 
col_start]x[row_start + horizontal_row_step, col_start + + // horizontal_col_step] block of output matrix + template + void horizontal_path( + Rows transposed_rows, Rows dst_rows, + Rectangle rect, size_t col_start, size_t row_start, + BorderInfoType border_info) KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + const ptrdiff_t col = static_cast(col_start) - kBorderSize; + const size_t block_size = kernel_block_size(); + + for (size_t kernel_block_row = 0; kernel_block_row < block_size; + kernel_block_row += kKernelIterationStep) { + horizontal_fma_part( + kernel_block_row, col, transposed_rows, border_info); + } + + horizontal_store_part(dst_rows, rect, col_start, row_start); + svzero_za(); + } + + // Apply gaussian kernel's Toeplitz matrix vertically and get + // [row_start, col_start]x[row_start + horizontal_row_step, col_start + + // horizontal_col_step] block of output matrix + template + void vertical_path(Rows src, Rows dst, + Rectangle rect, Rectangle padded_rect, size_t col_start, + size_t row_start, BorderInfoType border_info) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + svbool_t pred_row = svwhilelt_b8(col_start, rect.width() * Channels); + const ptrdiff_t row = static_cast(row_start) - kBorderSize; + const ptrdiff_t col = static_cast(col_start); + const ptrdiff_t block_size = static_cast(kernel_block_size()); + const ptrdiff_t padded_height = + static_cast(padded_rect.height()); + + for (ptrdiff_t kernel_block_row = 0; kernel_block_row < block_size && + kernel_block_row + row < padded_height; + kernel_block_row += kKernelIterationStep) { + vertical_fma_part(kernel_block_row, row, col, + pred_row, src, border_info); + } + + vertical_store_part(dst, rect, col_start, row_start); + svzero_za(); + } + + private: + // To avoid iterating over whole kernel's matrix, iterations are + // done over non-zero blocks that covers diagonal strip of kernel + // values. Since vector length that covers kernel's matrix is + // SVLW, then amount of non-zero rows is SVLW + K - 1 + static constexpr size_t kernel_block_size() KLEIDICV_STREAMING_COMPATIBLE { + return svcntw() + kKernelSize - 1; + } + + // Build helper buffer in which kernel's Toeplitz matrix block pattern + // will be stored. Since UMOPA for uint8 is being used, this + // buffer will contain rows in the format (zipped) that UMOPA + // expects to avoid zipping vectors on processing stage + void build_kernel_helper_buffer(uint8_t *kernel_buffer) { + size_t svlw = svcntsw(); + size_t kernel_buffer_size = svlw + kKernelSize - 1; + kernel_buffer_size = (kernel_buffer_size + kKernelIterationStep - 1) / + kKernelIterationStep * kKernelIterationStep; + + size_t index = 0; + for (size_t row = 0; row < kernel_buffer_size; + row += kKernelIterationStep) { + size_t row_start = row; + for (size_t col = 0; col < svlw; col++) { + for (size_t r = 0; r < kKernelIterationStep; r++) { + size_t res_id = row_start - col + r; + uint8_t tmp = (res_id >= kKernelSize ? 0 : kernel[res_id]); + uint8_t v = res_id < 0 ? 0 : tmp; + kernel_buffer[index++] = v; + } + } + } + } + + template + void horizontal_fma_part(ptrdiff_t kernel_block_row, ptrdiff_t col, + Rows transposed_rows, + BorderInfoType border_info) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + // Kernel data is already prepared for use in build_kernel_helper_buffer. + // No additional zip is needed. 
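    // With kKernelIterationStep == 4, the vector loaded below packs, for
    // each of the SVLW kernel-matrix columns c, the four bytes
    // k[kernel_block_row - c + 0..3] (zero outside [0, KernelSize)),
    // which matches the four-way 8-bit dot-product grouping that
    // svmopa_za32_u8_m performs per 32-bit accumulator.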
+ svuint8_t svkern = + svld1_u8(svptrue_b8(), &kernel_rows.at(kernel_block_row)[0]); + + const ptrdiff_t col0 = col + kernel_block_row + 0; + const ptrdiff_t col1 = col + kernel_block_row + 1; + const ptrdiff_t col2 = col + kernel_block_row + 2; + const ptrdiff_t col3 = col + kernel_block_row + 3; + svuint8_t svcol0, svcol1, svcol2, svcol3; + + if constexpr (EnableBorderTranslation) { + svcol0 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col0))[0]); + svcol1 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col1))[0]); + svcol2 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col2))[0]); + svcol3 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col3))[0]); + } else { + svcol0 = svld1_u8(svptrue_b8(), &transposed_rows.at(col0)[0]); + svcol1 = svld1_u8(svptrue_b8(), &transposed_rows.at(col1)[0]); + svcol2 = svld1_u8(svptrue_b8(), &transposed_rows.at(col2)[0]); + svcol3 = svld1_u8(svptrue_b8(), &transposed_rows.at(col3)[0]); + } + + svuint8x4_t svcols = svzip_u8_x4(svcreate4(svcol0, svcol1, svcol2, svcol3)); + + svmopa_za32_u8_m(0, svptrue_b8(), svptrue_b8(), svget4(svcols, 0), svkern); + svmopa_za32_u8_m(1, svptrue_b8(), svptrue_b8(), svget4(svcols, 1), svkern); + svmopa_za32_u8_m(2, svptrue_b8(), svptrue_b8(), svget4(svcols, 2), svkern); + svmopa_za32_u8_m(3, svptrue_b8(), svptrue_b8(), svget4(svcols, 3), svkern); + } + + template + void vertical_fma_part(ptrdiff_t kernel_block_row, ptrdiff_t row, + ptrdiff_t col, svbool_t pred_row, + Rows src, BorderInfoType border_info) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + // Kernel data is already prepared for use in build_kernel_helper_buffer. + // No additional zip is needed. + svuint8_t svkern = + svld1_u8(svptrue_b8(), &kernel_rows.at(kernel_block_row)[0]); + + const ptrdiff_t row0 = row + kernel_block_row + 0; + const ptrdiff_t row1 = row + kernel_block_row + 1; + const ptrdiff_t row2 = row + kernel_block_row + 2; + const ptrdiff_t row3 = row + kernel_block_row + 3; + + svuint8_t svcol0, svcol1, svcol2, svcol3; + if constexpr (EnableBorderTranslation) { + svcol0 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row0))[col]); + svcol1 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row1))[col]); + svcol2 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row2))[col]); + svcol3 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row3))[col]); + } else { + svcol0 = svld1_u8(pred_row, &src.at(row0)[col]); + svcol1 = svld1_u8(pred_row, &src.at(row1)[col]); + svcol2 = svld1_u8(pred_row, &src.at(row2)[col]); + svcol3 = svld1_u8(pred_row, &src.at(row3)[col]); + } + + svuint8x4_t svcols = svzip_u8_x4(svcreate4(svcol0, svcol1, svcol2, svcol3)); + + svmopa_za32_u8_m(0, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 0)); + svmopa_za32_u8_m(1, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 1)); + svmopa_za32_u8_m(2, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 2)); + svmopa_za32_u8_m(3, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 3)); + } + + // Store approach depends on channels. 1 channel images processed by SVLB + // stripes, while 3/4 channels processed by SVLW stripes and need additional + // interleaved stores/zips to respect elements order. 
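  // For example, with a 512-bit SVL the four ZA32 tiles each expose
  // svcntw() == 16 rows: the 1-channel path stores up to
  // 4 * svcntw() == svcntb() == 64 output rows per block, while the
  // 3- and 4-channel paths store svcntw() == 16 rows, each row rebuilt
  // from the four tiles with svqcvt_u8 plus a zip (4 channels) or an
  // svtbl permute (3 channels).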
+ template + void horizontal_store_part(Rows dst, Rectangle rect, size_t col, + size_t row_start) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA; + + template <> + void horizontal_store_part<1>(Rows dst, Rectangle rect, + size_t col, size_t row_start) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + constexpr size_t za32_tiles_count = 4; + horizontal_1_channel_store_all_za_tiles( + std::make_index_sequence{}, dst, rect, row_start, col, + svwhilelt_b32(col, rect.width())); + } + + template + void horizontal_1_channel_store_all_za_tiles( + std::index_sequence, Rows dst, Rectangle rect, + size_t row_start, size_t col, + svbool_t col_pred) KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + (horizontal_1_channel_store_single_za_tile(dst, rect, row_start, col, + col_pred), + ...); + } + + template + void horizontal_1_channel_store_single_za_tile( + Rows dst, Rectangle rect, size_t row_start, size_t col, + svbool_t col_pred) KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + for (size_t row = I * svcntw(); + row < (I + 1) * svcntw() && row_start + row < rect.height(); row++) { + svuint32_t res = postprocess_vector( + svread_hor_za32_u32_m(svundef_u32(), svptrue_b32(), I, row)); + auto *dst_row = &dst.at(static_cast(row_start + row), + static_cast(col))[0]; + svst1b_u32(col_pred, dst_row, res); + } + } + + template <> + void horizontal_store_part<3>(Rows dst, Rectangle rect, + size_t col, size_t row_start) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + constexpr size_t channels = 3; + svbool_t col_pred = svwhilelt_b8(channels * col, channels * rect.width()); + // Current implementation underutilizes tiles for 3 channels and + // last quarter of the result vector will always be empty. + svbool_t chs3_pred = + svwhilelt_b8(static_cast(0), channels * svcntw()); + svbool_t pred = svand_b_z(svptrue_b8(), col_pred, chs3_pred); + + // To interleave vectors in 3-channel way svtlb is used. + // To build result vector indices should be in the following format: + // 0, SVL/4, SVL/2, 1, SVL/4 + 1, SVL/2 + 1, ... + // <--0th pixel--> <---- 1st pixel ----> + // This corresponds to the sequence (i % 3) * SVL / 4 + i / 3 + svuint8_t indices = svindex_u8(0, 1); + // floor(2^8 / 3) + 1 == 86, used to divide by 3 without + // division. 
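    // For example, (5 * 86) >> 8 == 430 >> 8 == 1 == 5 / 3; the identity
    // (i * 86) >> 8 == i / 3 holds for all byte indices i < 128, which
    // covers the svindex_u8 values used here for vector lengths up to
    // 1024 bits.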
+ svuint8_t indices_div3 = + svmulh_u8_z(svptrue_b8(), indices, svdup_n_u8(86U)); + svuint8_t indices_mod3 = + svsub_u8_z(svptrue_b8(), indices, + svmul_z(svptrue_b8(), svdup_u8(3), indices_div3)); + svuint8_t indices_final = svadd_z( + svptrue_b8(), indices_div3, + svrshr_n_u8_z(svptrue_b8(), + svmul_z(svptrue_b8(), svdup_u8(svcntb()), indices_mod3), + 2)); + + for (size_t row = 0; row < svcntw() && row_start + row < rect.height(); + row++) { + svuint32x4_t resu = + (svcreate4(postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 0, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 1, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 2, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 3, row)))); + + svuint8_t svres = svqcvt_u8(resu); + svres = svtbl(svres, indices_final); + svst1(pred, + &dst.at(static_cast(row_start + row), + static_cast(col))[0], + svres); + } + } + + template <> + void horizontal_store_part<4>(Rows dst, Rectangle rect, + size_t col, size_t row_start) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + constexpr size_t channels = 4; + svbool_t col_pred = svwhilelt_b8(channels * col, channels * rect.width()); + for (size_t row = 0; row < svcntw() && row_start + row < rect.height(); + row++) { + svuint32x4_t resu = + (svcreate4(postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 0, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 1, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 2, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 3, row)))); + + resu = svzip_u32_x4(resu); + svst1(col_pred, + &dst.at(static_cast(row_start + row), + static_cast(col))[0], + svqcvt_u8(resu)); + } + } + + template + void vertical_store_part(Rows dst, Rectangle rect, + size_t col_start, size_t row_start) + KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + size_t svl = svcntw(); + svbool_t pred_row0 = + svwhilelt_b32(col_start + 0 * svcntw(), rect.width() * Channels); + svbool_t pred_row1 = + svwhilelt_b32(col_start + 1 * svcntw(), rect.width() * Channels); + svbool_t pred_row2 = + svwhilelt_b32(col_start + 2 * svcntw(), rect.width() * Channels); + svbool_t pred_row3 = + svwhilelt_b32(col_start + 3 * svcntw(), rect.width() * Channels); + + for (size_t row = 0; row < svl && (row_start + row) < rect.height(); + row++) { + svuint32x4_t resu = + svcreate4(postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 0, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 1, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 2, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 3, row))); + + auto *dst_row = &dst.at(static_cast( + row_start + row))[static_cast(col_start)]; + + auto *KLEIDICV_RESTRICT dst_row0 = dst_row + 0 * svcntw(); + auto *KLEIDICV_RESTRICT dst_row1 = dst_row + 1 * svcntw(); + auto *KLEIDICV_RESTRICT dst_row2 = dst_row + 2 * svcntw(); + auto *KLEIDICV_RESTRICT dst_row3 = dst_row + 3 * svcntw(); + + svst1b_u32(pred_row0, dst_row0, svget4(resu, 0)); + svst1b_u32(pred_row1, dst_row1, svget4(resu, 1)); + svst1b_u32(pred_row2, dst_row2, svget4(resu, 2)); + svst1b_u32(pred_row3, dst_row3, svget4(resu, 3)); + } + } + + svuint32_t postprocess_vector_no_clamp(svuint32_t v) + KLEIDICV_STREAMING 
KLEIDICV_PRESERVES_ZA { + return svrshr_n_u32_x(svptrue_b32(), v, 8); + } + + svuint32_t postprocess_vector(svuint32_t v) + KLEIDICV_STREAMING KLEIDICV_PRESERVES_ZA { + return svclamp_u32(svdup_u32(0), svdup_u32(255), + postprocess_vector_no_clamp(v)); + } + + std::array kernel; + Rows kernel_rows; +}; + +// Class to transpose image data for horizontal processing +template +class Transposer { + public: + // Data layout depends on channels amount. + // + // 1) For 1 channel, row in the output buffer will correspond to a column + // part + // of an image for rows [row_start, row_start + SVLB]. + // + // 2) For 4 channel, row in the output buffer will correspond to a column + // part + // of an image for rows [row_start, row_start + SVLW], where first SVLW + // elements are 0th channel elements, second SVLW elements are + // corresponding 1th channel elements and so on. + // + // 3) For 3 channel the layout is the same as for 4 channel, however last + // SVLW elements are + // not used (consequently last tile won't be used for processing). + template + void transpose(Rows src_rows, + Rows transpose_buffer, Rectangle rect, + size_t row_start, + size_t rows) KLEIDICV_STREAMING KLEIDICV_INOUT_ZA { + const size_t svlb = svcntb(); + const size_t za_channel_padding = svlb >> 2; + + svzero_za(); + + for (size_t col = 0; col < rect.width(); col += svlb) { + svbool_t pred = svwhilelt_b8(col, rect.width()); + + for (size_t row = 0; row < rows; row++) { + auto *p = &src_rows.at(static_cast(row_start + row), + static_cast(col))[0]; + if constexpr (Channels == 4) { + svuint8x4_t c = svld4(pred, p); + svwrite_hor_za8_m(0, row + 0 * za_channel_padding, svptrue_b8(), + svget4(c, 0)); + svwrite_hor_za8_m(0, row + 1 * za_channel_padding, svptrue_b8(), + svget4(c, 1)); + svwrite_hor_za8_m(0, row + 2 * za_channel_padding, svptrue_b8(), + svget4(c, 2)); + svwrite_hor_za8_m(0, row + 3 * za_channel_padding, svptrue_b8(), + svget4(c, 3)); + } else if constexpr (Channels == 1) { + svuint8_t c = svld1(pred, p); + svwrite_hor_za8_m(0, row, svptrue_b8(), c); + } else if constexpr (Channels == 3) { + svuint8x3_t c = svld3(pred, p); + svwrite_hor_za8_m(0, row + 0 * za_channel_padding, svptrue_b8(), + svget3(c, 0)); + svwrite_hor_za8_m(0, row + 1 * za_channel_padding, svptrue_b8(), + svget3(c, 1)); + svwrite_hor_za8_m(0, row + 2 * za_channel_padding, svptrue_b8(), + svget3(c, 2)); + } else { + static_assert(false, "Unsupported amount of channels for transpose"); + } + } + + for (size_t i = 0; i < svlb; i++) { + svst1_ver_za8(0, i, svptrue_b8(), &transpose_buffer.at(col + i)[0]); + } + svzero_za(); + } + } + + private: +}; + +template +static kleidicv_error_t gaussian_blur( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, + size_t kernel_width, size_t, float sigma_x, float, + FixedBorderType fixed_border_type, + MatmulSeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + Rectangle rect{width, height}; + Rows src_rows{src, src_stride, Channels}; + Rows dst_rows{dst, dst_stride, Channels}; + uint8_t *kernel_buffer = + reinterpret_cast(workspace->get_kernel_buffer()); + + switch (kernel_width) { + case 7: { + using GaussianBlur = GaussianBlurMatmul<7>; + GaussianBlur inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + case 15: { + using GaussianBlur = GaussianBlurMatmul<15>; + GaussianBlur 
inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + case 21: { + using GaussianBlur = GaussianBlurMatmul<21>; + GaussianBlur inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + + default: + assert(!"kernel size not implemented"); + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + return KLEIDICV_OK; +} + +template +static kleidicv_error_t call_gaussian_blur( + std::index_sequence, const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, + size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType fixed_border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = gaussian_blur_checks( + src, src_stride, dst, dst_stride, width, height, channels, workspace); + if (kernel_width != kernel_height) { + checks_result = KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + // If no case triggered result = ..., then `channels` variable + // is out of range + kleidicv_error_t exec_result = KLEIDICV_ERROR_NOT_IMPLEMENTED; + ((channels == C ? (exec_result = gaussian_blur( + src, src_stride, dst, dst_stride, width, height, + y_begin, y_end, kernel_width, kernel_height, sigma_x, + sigma_y, fixed_border_type, workspace), + void()) + : void()), + ...); + return exec_result; +} + +static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sme2( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType fixed_border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + if (!gaussian_blur_sme2_implementation_checks(kernel_width, kernel_height, + channels)) { + return gaussian_blur_fixed_stripe_u8_sc( + src, src_stride, dst, dst_stride, width, height, y_begin, y_end, + channels, kernel_width, kernel_height, sigma_x, sigma_y, + fixed_border_type, context); + } + + constexpr std::index_sequence<1, 3, 4> supported_channels; + return call_gaussian_blur(supported_channels, src, src_stride, dst, + dst_stride, width, height, y_begin, y_end, channels, + kernel_width, kernel_height, sigma_x, sigma_y, + fixed_border_type, context); +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif diff --git a/kleidicv/src/filters/gaussian_blur_sme.h b/kleidicv/src/filters/gaussian_blur_sme.h new file mode 100644 index 0000000000000000000000000000000000000000..561b4df9d337bddc1547c969425b9908b8e2d5de --- /dev/null +++ b/kleidicv/src/filters/gaussian_blur_sme.h @@ -0,0 +1,608 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef GAUSSIAN_BLUR_SME_H +#define GAUSSIAN_BLUR_SME_H + +#include +#include + +#include +#include +#include +#include + +#include "gaussian_blur_sc.h" +#include "kleidicv/config.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/matmul.h" +#include "kleidicv/filters/matmul_filter_checks.h" +#include "kleidicv/filters/sigma.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/types.h" 
+#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_types.h" +#include "kleidicv/workspace/matmul.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class GaussianBlurMatmul { + public: + using SourceType = uint8_t; + using BufferType = uint8_t; + using DestinationType = uint8_t; + using BorderInfoType = BorderInfo; + using BorderType = FixedBorderType; + + static constexpr size_t kKernelSize = KernelSize; + static constexpr size_t kBorderSize = KernelSize / 2; + static constexpr size_t kKernelIterationStep = 4; + + // Class with iterations steps info for filter class + template + class IterationsInfo { + public: + IterationsInfo() KLEIDICV_STREAMING_COMPATIBLE = default; + size_t horizontal_col_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntw(); + } + + size_t horizontal_row_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + // Data layout for 1 channel differs on transposition step + if constexpr (Channels == 4 || Channels == 3) { + return svcntw(); + } else if constexpr (Channels == 1) { + return svcntb(); + } else { + static_assert(false); + } + } + + size_t vertical_row_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntw(); + } + + size_t vertical_col_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return svcntb(); + } + + ptrdiff_t kernel_block_size() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return GaussianBlurMatmul::kernel_block_size(); + } + + size_t kernel_iteration_step() const KLEIDICV_STREAMING_COMPATIBLE + KLEIDICV_PRESERVES_ZA { + return kKernelIterationStep; + } + }; + + explicit GaussianBlurMatmul(float sigma, uint8_t *kernel_buffer) + : kernel(generate_gaussian_float_kernel(sigma)) { + build_kernel_helper_buffer(kernel_buffer); + kernel_rows = Rows{kernel_buffer, svcntsw()}; + } + + // Apply gaussian kernel's Toeplitz matrix horizontally and get + // [row_start, col_start]x[row_start + horizontal_row_step, col_start + + // horizontal_col_step] block of output matrix + template + KLEIDICV_LOCALLY_STREAMING void horizontal_path( + Rows transposed_rows, Rows dst_rows, + Rectangle rect, size_t col_start, size_t row_start, + BorderInfoType border_info) KLEIDICV_INOUT_ZA { + const ptrdiff_t col = static_cast(col_start) - kBorderSize; + const size_t block_size = kernel_block_size(); + + for (size_t kernel_block_row = 0; kernel_block_row < block_size; + kernel_block_row += kKernelIterationStep) { + horizontal_fma_part( + kernel_block_row, col, transposed_rows, border_info); + } + + horizontal_store_part(dst_rows, rect, col_start, row_start); + svzero_za(); + } + + // Apply gaussian kernel's Toeplitz matrix vertically and get + // [row_start, col_start]x[row_start + horizontal_row_step, col_start + + // horizontal_col_step] block of output matrix + template + KLEIDICV_LOCALLY_STREAMING void vertical_path( + Rows src, Rows dst, Rectangle rect, + Rectangle padded_rect, size_t col_start, size_t row_start, + BorderInfoType border_info) KLEIDICV_INOUT_ZA { + svbool_t pred_row = svwhilelt_b8(col_start, rect.width() * Channels); + const ptrdiff_t row = static_cast(row_start) - kBorderSize; + const ptrdiff_t col = static_cast(col_start); + const ptrdiff_t block_size = static_cast(kernel_block_size()); + const ptrdiff_t padded_height = + static_cast(padded_rect.height()); + + for (ptrdiff_t kernel_block_row = 0; kernel_block_row < block_size && + kernel_block_row + row < padded_height; + kernel_block_row += 
kKernelIterationStep) { + vertical_fma_part(kernel_block_row, row, col, + pred_row, src, border_info); + } + + vertical_store_part(dst, rect, col_start, row_start); + svzero_za(); + } + + private: + // To avoid iterating over whole kernel's matrix, iterations are + // done over non-zero blocks that covers diagonal strip of kernel + // values. Since vector length that covers kernel's matrix is + // SVLW, then amount of non-zero rows is SVLW + K - 1 + static constexpr size_t kernel_block_size() KLEIDICV_STREAMING_COMPATIBLE { + return svcntw() + kKernelSize - 1; + } + + // Build helper buffer in which kernel's Toeplitz matrix block pattern + // will be stored. Since UMOPA for uint8 is being used, this + // buffer will contain rows in the format (zipped) that UMOPA + // expects to avoid zipping vectors on processing stage + void build_kernel_helper_buffer(uint8_t *kernel_buffer) { + size_t svlw = svcntsw(); + size_t kernel_buffer_size = svlw + kKernelSize - 1; + kernel_buffer_size = (kernel_buffer_size + kKernelIterationStep - 1) / + kKernelIterationStep * kKernelIterationStep; + + size_t index = 0; + for (size_t row = 0; row < kernel_buffer_size; + row += kKernelIterationStep) { + size_t row_start = row; + for (size_t col = 0; col < svlw; col++) { + for (size_t r = 0; r < kKernelIterationStep; r++) { + size_t res_id = row_start - col + r; + uint8_t tmp = (res_id >= kKernelSize ? 0 : kernel[res_id]); + uint8_t v = res_id < 0 ? 0 : tmp; + kernel_buffer[index++] = v; + } + } + } + } + + template + KLEIDICV_LOCALLY_STREAMING void horizontal_fma_part( + ptrdiff_t kernel_block_row, ptrdiff_t col, + Rows transposed_rows, + BorderInfoType border_info) KLEIDICV_INOUT_ZA { + // Kernel data is already prepared for use in build_kernel_helper_buffer. + // No additional zip is needed. + svuint8_t svkern = + svld1_u8(svptrue_b8(), &kernel_rows.at(kernel_block_row)[0]); + + const ptrdiff_t col0 = col + kernel_block_row + 0; + const ptrdiff_t col1 = col + kernel_block_row + 1; + const ptrdiff_t col2 = col + kernel_block_row + 2; + const ptrdiff_t col3 = col + kernel_block_row + 3; + svuint8_t svcol0, svcol1, svcol2, svcol3; + + if constexpr (EnableBorderTranslation) { + svcol0 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col0))[0]); + svcol1 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col1))[0]); + svcol2 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col2))[0]); + svcol3 = + svld1_u8(svptrue_b8(), + &transposed_rows.at(border_info.translate_index(col3))[0]); + } else { + svcol0 = svld1_u8(svptrue_b8(), &transposed_rows.at(col0)[0]); + svcol1 = svld1_u8(svptrue_b8(), &transposed_rows.at(col1)[0]); + svcol2 = svld1_u8(svptrue_b8(), &transposed_rows.at(col2)[0]); + svcol3 = svld1_u8(svptrue_b8(), &transposed_rows.at(col3)[0]); + } + + svuint8x4_t svcols = svzip_u8_x4(svcreate4(svcol0, svcol1, svcol2, svcol3)); + + svmopa_za32_u8_m(0, svptrue_b8(), svptrue_b8(), svget4(svcols, 0), svkern); + svmopa_za32_u8_m(1, svptrue_b8(), svptrue_b8(), svget4(svcols, 1), svkern); + svmopa_za32_u8_m(2, svptrue_b8(), svptrue_b8(), svget4(svcols, 2), svkern); + svmopa_za32_u8_m(3, svptrue_b8(), svptrue_b8(), svget4(svcols, 3), svkern); + } + + template + KLEIDICV_LOCALLY_STREAMING void vertical_fma_part( + ptrdiff_t kernel_block_row, ptrdiff_t row, ptrdiff_t col, + svbool_t pred_row, Rows src, + BorderInfoType border_info) KLEIDICV_INOUT_ZA { + // Kernel data is already prepared for use in build_kernel_helper_buffer. 
+ // No additional zip is needed. + svuint8_t svkern = + svld1_u8(svptrue_b8(), &kernel_rows.at(kernel_block_row)[0]); + + const ptrdiff_t row0 = row + kernel_block_row + 0; + const ptrdiff_t row1 = row + kernel_block_row + 1; + const ptrdiff_t row2 = row + kernel_block_row + 2; + const ptrdiff_t row3 = row + kernel_block_row + 3; + + svuint8_t svcol0, svcol1, svcol2, svcol3; + if constexpr (EnableBorderTranslation) { + svcol0 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row0))[col]); + svcol1 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row1))[col]); + svcol2 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row2))[col]); + svcol3 = + svld1_u8(pred_row, &src.at(border_info.translate_index(row3))[col]); + } else { + svcol0 = svld1_u8(pred_row, &src.at(row0)[col]); + svcol1 = svld1_u8(pred_row, &src.at(row1)[col]); + svcol2 = svld1_u8(pred_row, &src.at(row2)[col]); + svcol3 = svld1_u8(pred_row, &src.at(row3)[col]); + } + + svuint8x4_t svcols = svzip_u8_x4(svcreate4(svcol0, svcol1, svcol2, svcol3)); + + svmopa_za32_u8_m(0, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 0)); + svmopa_za32_u8_m(1, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 1)); + svmopa_za32_u8_m(2, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 2)); + svmopa_za32_u8_m(3, svptrue_b8(), svptrue_b8(), svkern, svget4(svcols, 3)); + } + + // Store approach depends on channels. 1 channel images processed by SVLB + // stripes, while 3/4 channels processed by SVLW stripes and need additional + // interleaved stores/zips to respect elements order. + template + KLEIDICV_LOCALLY_STREAMING void horizontal_store_part( + Rows dst, Rectangle rect, size_t col, + size_t row_start) KLEIDICV_INOUT_ZA; + + template <> + KLEIDICV_LOCALLY_STREAMING void horizontal_store_part<1>( + Rows dst, Rectangle rect, size_t col, + size_t row_start) KLEIDICV_INOUT_ZA { + constexpr size_t za32_tiles_count = 4; + horizontal_1_channel_store_all_za_tiles( + std::make_index_sequence{}, dst, rect, row_start, col, + svwhilelt_b32(col, rect.width())); + } + + template + KLEIDICV_LOCALLY_STREAMING void horizontal_1_channel_store_all_za_tiles( + std::index_sequence, Rows dst, Rectangle rect, + size_t row_start, size_t col, svbool_t col_pred) KLEIDICV_INOUT_ZA { + (horizontal_1_channel_store_single_za_tile(dst, rect, row_start, col, + col_pred), + ...); + } + + template + KLEIDICV_LOCALLY_STREAMING void horizontal_1_channel_store_single_za_tile( + Rows dst, Rectangle rect, size_t row_start, size_t col, + svbool_t col_pred) KLEIDICV_INOUT_ZA { + for (size_t row = I * svcntw(); + row < (I + 1) * svcntw() && row_start + row < rect.height(); row++) { + svuint32_t res = postprocess_vector( + svread_hor_za32_u32_m(svundef_u32(), svptrue_b32(), I, row)); + auto *dst_row = &dst.at(static_cast(row_start + row), + static_cast(col))[0]; + svst1b_u32(col_pred, dst_row, res); + } + } + + template <> + KLEIDICV_LOCALLY_STREAMING void horizontal_store_part<3>( + Rows dst, Rectangle rect, size_t col, + size_t row_start) KLEIDICV_INOUT_ZA { + constexpr size_t channels = 3; + svbool_t col_pred = svwhilelt_b8(channels * col, channels * rect.width()); + // Current implementation underutilizes tiles for 3 channels and + // last quarter of the result vector will always be empty. + svbool_t chs3_pred = + svwhilelt_b8(static_cast(0), channels * svcntw()); + svbool_t pred = svand_b_z(svptrue_b8(), col_pred, chs3_pred); + + // To interleave vectors in 3-channel way svtlb is used. 
+ // To build result vector indices should be in the following format: + // 0, SVL/4, SVL/2, 1, SVL/4 + 1, SVL/2 + 1, ... + // <--0th pixel--> <---- 1st pixel ----> + // This corresponds to the sequence (i % 3) * SVL / 4 + i / 3 + svuint8_t indices = svindex_u8(0, 1); + // floor(2^8 / 3) + 1 == 86, used to divide by 3 without + // division. + svuint8_t indices_div3 = + svmulh_u8_z(svptrue_b8(), indices, svdup_n_u8(86U)); + svuint8_t indices_mod3 = + svsub_u8_z(svptrue_b8(), indices, + svmul_z(svptrue_b8(), svdup_u8(3), indices_div3)); + svuint8_t indices_final = svadd_z( + svptrue_b8(), indices_div3, + svrshr_n_u8_z(svptrue_b8(), + svmul_z(svptrue_b8(), svdup_u8(svcntb()), indices_mod3), + 2)); + + for (size_t row = 0; row < svcntw() && row_start + row < rect.height(); + row++) { + svuint32x4_t resu = + (svcreate4(postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 0, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 1, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 2, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 3, row)))); + + svuint8_t svres = svqcvt_u8(resu); + svres = svtbl(svres, indices_final); + svst1(pred, + &dst.at(static_cast(row_start + row), + static_cast(col))[0], + svres); + } + } + + template <> + KLEIDICV_LOCALLY_STREAMING void horizontal_store_part<4>( + Rows dst, Rectangle rect, size_t col, + size_t row_start) KLEIDICV_INOUT_ZA { + constexpr size_t channels = 4; + svbool_t col_pred = svwhilelt_b8(channels * col, channels * rect.width()); + for (size_t row = 0; row < svcntw() && row_start + row < rect.height(); + row++) { + svuint32x4_t resu = + (svcreate4(postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 0, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 1, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 2, row)), + postprocess_vector_no_clamp(svread_hor_za32_u32_m( + svdup_u32(0), svptrue_b32(), 3, row)))); + + resu = svzip_u32_x4(resu); + svst1(col_pred, + &dst.at(static_cast(row_start + row), + static_cast(col))[0], + svqcvt_u8(resu)); + } + } + + template + KLEIDICV_LOCALLY_STREAMING void vertical_store_part( + Rows dst, Rectangle rect, size_t col_start, + size_t row_start) KLEIDICV_INOUT_ZA { + size_t svl = svcntw(); + svbool_t pred_row0 = + svwhilelt_b32(col_start + 0 * svcntw(), rect.width() * Channels); + svbool_t pred_row1 = + svwhilelt_b32(col_start + 1 * svcntw(), rect.width() * Channels); + svbool_t pred_row2 = + svwhilelt_b32(col_start + 2 * svcntw(), rect.width() * Channels); + svbool_t pred_row3 = + svwhilelt_b32(col_start + 3 * svcntw(), rect.width() * Channels); + + for (size_t row = 0; row < svl && (row_start + row) < rect.height(); + row++) { + svuint32x4_t resu = + svcreate4(postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 0, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 1, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 2, row)), + postprocess_vector(svread_hor_za32_u32_m( + svdup_u32(0.0), svptrue_b32(), 3, row))); + + auto *dst_row = &dst.at(static_cast( + row_start + row))[static_cast(col_start)]; + + auto *KLEIDICV_RESTRICT dst_row0 = dst_row + 0 * svcntw(); + auto *KLEIDICV_RESTRICT dst_row1 = dst_row + 1 * svcntw(); + auto *KLEIDICV_RESTRICT dst_row2 = dst_row + 2 * 
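The multiply-high constant used above can be sanity-checked in scalar code: for lane indices below 128, (i * 86) >> 8 equals i / 3, and the index vector built here only covers 3 * SVLW lanes, which stays in that range for streaming vector lengths up to 1024 bits. A small verification sketch:

#include <cassert>

int main() {
  for (unsigned i = 0; i < 128; ++i) {
    unsigned div3 = (i * 86u) >> 8;  // what svmulh_u8_z computes per lane
    unsigned mod3 = i - 3u * div3;   // remainder recovered without '%'
    assert(div3 == i / 3u);
    assert(mod3 == i % 3u);
  }
  return 0;
}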
svcntw(); + auto *KLEIDICV_RESTRICT dst_row3 = dst_row + 3 * svcntw(); + + svst1b_u32(pred_row0, dst_row0, svget4(resu, 0)); + svst1b_u32(pred_row1, dst_row1, svget4(resu, 1)); + svst1b_u32(pred_row2, dst_row2, svget4(resu, 2)); + svst1b_u32(pred_row3, dst_row3, svget4(resu, 3)); + } + } + + KLEIDICV_LOCALLY_STREAMING svuint32_t + postprocess_vector_no_clamp(svuint32_t v) KLEIDICV_PRESERVES_ZA { + return svrshr_n_u32_x(svptrue_b32(), v, 8); + } + + KLEIDICV_LOCALLY_STREAMING svuint32_t postprocess_vector(svuint32_t v) + KLEIDICV_PRESERVES_ZA { + return svclamp_u32(svdup_u32(0), svdup_u32(255), + postprocess_vector_no_clamp(v)); + } + + std::array kernel; + Rows kernel_rows; +}; + +// Class to transpose image data for horizontal processing +template +class Transposer { + public: + // Data layout depends on channels amount. + // + // 1) For 1 channel, row in the output buffer will correspond to a column + // part + // of an image for rows [row_start, row_start + SVLB]. + // + // 2) For 4 channel, row in the output buffer will correspond to a column + // part + // of an image for rows [row_start, row_start + SVLW], where first SVLW + // elements are 0th channel elements, second SVLW elements are + // corresponding 1th channel elements and so on. + // + // 3) For 3 channel the layout is the same as for 4 channel, however last + // SVLW elements are + // not used (consequently last tile won't be used for processing). + template + KLEIDICV_LOCALLY_STREAMING void transpose(Rows src_rows, + Rows transpose_buffer, + Rectangle rect, size_t row_start, + size_t rows) KLEIDICV_INOUT_ZA { + const size_t svlb = svcntb(); + const size_t za_channel_padding = svlb >> 2; + + svzero_za(); + + for (size_t col = 0; col < rect.width(); col += svlb) { + svbool_t pred = svwhilelt_b8(col, rect.width()); + + for (size_t row = 0; row < rows; row++) { + auto *p = &src_rows.at(static_cast(row_start + row), + static_cast(col))[0]; + if constexpr (Channels == 4) { + svuint8x4_t c = svld4(pred, p); + svwrite_hor_za8_m(0, row + 0 * za_channel_padding, svptrue_b8(), + svget4(c, 0)); + svwrite_hor_za8_m(0, row + 1 * za_channel_padding, svptrue_b8(), + svget4(c, 1)); + svwrite_hor_za8_m(0, row + 2 * za_channel_padding, svptrue_b8(), + svget4(c, 2)); + svwrite_hor_za8_m(0, row + 3 * za_channel_padding, svptrue_b8(), + svget4(c, 3)); + } else if constexpr (Channels == 1) { + svuint8_t c = svld1(pred, p); + svwrite_hor_za8_m(0, row, svptrue_b8(), c); + } else if constexpr (Channels == 3) { + svuint8x3_t c = svld3(pred, p); + svwrite_hor_za8_m(0, row + 0 * za_channel_padding, svptrue_b8(), + svget3(c, 0)); + svwrite_hor_za8_m(0, row + 1 * za_channel_padding, svptrue_b8(), + svget3(c, 1)); + svwrite_hor_za8_m(0, row + 2 * za_channel_padding, svptrue_b8(), + svget3(c, 2)); + } else { + static_assert(false, "Unsupported amount of channels for transpose"); + } + } + + for (size_t i = 0; i < svlb; i++) { + svst1_ver_za8(0, i, svptrue_b8(), &transpose_buffer.at(col + i)[0]); + } + svzero_za(); + } + } + + private: +}; + +template +static kleidicv_error_t gaussian_blur( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, + size_t kernel_width, size_t, float sigma_x, float, + FixedBorderType fixed_border_type, + MatmulSeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + Rectangle rect{width, height}; + Rows src_rows{src, src_stride, Channels}; + Rows dst_rows{dst, dst_stride, Channels}; + uint8_t *kernel_buffer = + 
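postprocess_vector() and postprocess_vector_no_clamp() reduce each 32-bit accumulator back to a pixel value: a rounding shift right by 8 followed by a clamp to the u8 range. A scalar sketch of the same arithmetic, assuming the 1-D taps are quantized with 8 fractional bits (the quantization itself lives in GaussianBlurMatmul and is outside this hunk):

#include <algorithm>
#include <cstdint>

inline uint8_t postprocess_scalar(uint32_t acc) {
  uint32_t rounded = (acc + 128u) >> 8;  // mirrors svrshr_n_u32_x(..., 8)
  return static_cast<uint8_t>(std::min<uint32_t>(rounded, 255u));  // clamp
}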
reinterpret_cast(workspace->get_kernel_buffer()); + + switch (kernel_width) { + case 7: { + using GaussianBlur = GaussianBlurMatmul<7>; + GaussianBlur inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + case 15: { + using GaussianBlur = GaussianBlurMatmul<15>; + GaussianBlur inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + case 21: { + using GaussianBlur = GaussianBlurMatmul<21>; + GaussianBlur inner_filter(sigma_x, kernel_buffer); + MatmulFilter> filter( + inner_filter); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, + fixed_border_type, filter); + break; + } + + default: + assert(!"kernel size not implemented"); + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + return KLEIDICV_OK; +} + +template +static kleidicv_error_t call_gaussian_blur( + std::index_sequence, const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, size_t width, size_t height, + size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, + size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType fixed_border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = gaussian_blur_checks( + src, src_stride, dst, dst_stride, width, height, channels, workspace); + if (kernel_width != kernel_height) { + checks_result = KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + // If no case triggered result = ..., then `channels` variable + // is out of range + kleidicv_error_t exec_result = KLEIDICV_ERROR_NOT_IMPLEMENTED; + ((channels == C ? 
(exec_result = gaussian_blur( + src, src_stride, dst, dst_stride, width, height, + y_begin, y_end, kernel_width, kernel_height, sigma_x, + sigma_y, fixed_border_type, workspace), + void()) + : void()), + ...); + return exec_result; +} + +static kleidicv_error_t gaussian_blur_stripe_u8_sme( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, + size_t kernel_width, size_t kernel_height, float sigma_x, float sigma_y, + FixedBorderType fixed_border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + if (!gaussian_blur_sme_implementation_checks(kernel_width, kernel_height, + channels)) { + return gaussian_blur_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, y_begin, y_end, channels, + kernel_width, kernel_height, sigma_x, + sigma_y, fixed_border_type, context); + } + + constexpr std::index_sequence<1, 3, 4> supported_channels; + return call_gaussian_blur(supported_channels, src, src_stride, dst, + dst_stride, width, height, y_begin, y_end, channels, + kernel_width, kernel_height, sigma_x, sigma_y, + fixed_border_type, context); +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif diff --git a/kleidicv/src/filters/separable_filter_2d_api.cpp b/kleidicv/src/filters/separable_filter_2d_api.cpp index 9bebc29d7c775682230712aa06c88c202b8fa3b4..cc07280e5da9bb105cfa3b7db651c4f1bf9d5a5f 100644 --- a/kleidicv/src/filters/separable_filter_2d_api.cpp +++ b/kleidicv/src/filters/separable_filter_2d_api.cpp @@ -6,6 +6,7 @@ #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/workspace/separable.h" +#include "kleidicv/workspace/workspace_factory.h" namespace kleidicv { @@ -57,6 +58,19 @@ KLEIDICV_DEFINE_C_API(kleidicv_separable_filter_2d_stripe_u8, uint8_t); KLEIDICV_DEFINE_C_API(kleidicv_separable_filter_2d_stripe_u16, uint16_t); KLEIDICV_DEFINE_C_API(kleidicv_separable_filter_2d_stripe_s16, int16_t); +KLEIDICV_MULTIVERSION_C_API(create_separable_filter_workspace, + &kleidicv::neon::create_separable_filter_workspace, + &kleidicv::sve2::create_separable_filter_workspace, + &kleidicv::sme::create_separable_filter_workspace, + &kleidicv::sme2::create_separable_filter_workspace); + +KLEIDICV_MULTIVERSION_C_API( + release_separable_filter_workspace, + &kleidicv::neon::release_separable_filter_workspace, + &kleidicv::sve2::release_separable_filter_workspace, + &kleidicv::sme::release_separable_filter_workspace, + &kleidicv::sme2::release_separable_filter_workspace); + extern "C" { using KLEIDICV_TARGET_NAMESPACE::Rectangle; @@ -81,28 +95,23 @@ kleidicv_error_t kleidicv_filter_context_create( // As we cannot predict the intermediate size based on the parameters given, // just use the largest possible size out of all available operations. 
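The channel dispatch in call_gaussian_blur expands a compile-time list of supported channel counts into a chain of runtime comparisons via a fold expression, so the kernel is instantiated only for those counts. A minimal, self-contained sketch of the same pattern (process<>() and run_for_channels() are illustrative names, not library functions):

#include <cstddef>
#include <utility>

// Stand-in for gaussian_blur<Channels>(); returns the channel count it was
// instantiated for so the dispatch is observable.
template <std::size_t Channels>
int process() {
  return static_cast<int>(Channels);
}

// Expands to: if (channels == 1) ...; if (channels == 3) ...; and so on.
template <std::size_t... C>
int run_for_channels(std::index_sequence<C...>, std::size_t channels) {
  int result = -1;  // stays "not implemented" if no entry matches
  ((channels == C ? (result = process<C>(), void()) : void()), ...);
  return result;
}

int main() {
  constexpr std::index_sequence<1, 3, 4> supported{};
  return run_for_channels(supported, 3);  // dispatches to process<3>()
}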
constexpr size_t intermediate_size = sizeof(uint32_t); - auto workspace = SeparableFilterWorkspace::create( - Rectangle{max_image_width, max_image_height}, max_channels, - intermediate_size); + auto *workspace = create_separable_filter_workspace( + max_image_width, max_image_height, max_kernel_width, max_kernel_height, + max_channels, intermediate_size); + if (!workspace) { *context = nullptr; return KLEIDICV_ERROR_ALLOCATION; } - *context = reinterpret_cast(workspace.release()); + *context = reinterpret_cast(workspace); return KLEIDICV_OK; } kleidicv_error_t kleidicv_filter_context_release( kleidicv_filter_context_t *context) { CHECK_POINTERS(context); - - // Deliberately create and immediately destroy a unique_ptr to delete the - // workspace. - // NOLINTBEGIN(bugprone-unused-raii) - SeparableFilterWorkspace::Pointer{ - reinterpret_cast(context)}; - // NOLINTEND(bugprone-unused-raii) + release_separable_filter_workspace(reinterpret_cast(context)); return KLEIDICV_OK; } diff --git a/kleidicv/src/workspace/workspace_factory_neon.cpp b/kleidicv/src/workspace/workspace_factory_neon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d5d9e8138ec1893094f8e090126576972f3e449 --- /dev/null +++ b/kleidicv/src/workspace/workspace_factory_neon.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/workspace/separable.h" +#include "kleidicv/workspace/workspace_factory.h" + +namespace kleidicv::neon { + +KLEIDICV_TARGET_FN_ATTRS void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t, size_t, + size_t max_channels, size_t intermediate_size) { + Rectangle max_rect(max_image_width, max_image_height); + auto workspace = SeparableFilterWorkspace::create(max_rect, max_channels, + intermediate_size); + + return workspace.release(); +} + +KLEIDICV_TARGET_FN_ATTRS void release_separable_filter_workspace( + void *workspace) { + // Deliberately create and immediately destroy a unique_ptr to delete the + // workspace. + // NOLINTBEGIN(bugprone-unused-raii) + auto ptr = SeparableFilterWorkspace::Pointer{ + reinterpret_cast(workspace)}; + // NOLINTEND(bugprone-unused-raii) +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/workspace/workspace_factory_sme.cpp b/kleidicv/src/workspace/workspace_factory_sme.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d01fe8e2e8f34ffbd8fc2be61dfe9ec67553b66 --- /dev/null +++ b/kleidicv/src/workspace/workspace_factory_sme.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/workspace/separable.h" +#include "kleidicv/workspace/workspace_factory.h" + +namespace kleidicv::sme { + +KLEIDICV_TARGET_FN_ATTRS void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t, size_t, + size_t max_channels, size_t intermediate_size) { + Rectangle max_rect(max_image_width, max_image_height); + auto workspace = SeparableFilterWorkspace::create(max_rect, max_channels, + intermediate_size); + + return workspace.release(); +} + +KLEIDICV_TARGET_FN_ATTRS void release_separable_filter_workspace( + void *workspace) { + // Deliberately create and immediately destroy a unique_ptr to delete the + // workspace. 
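The workspace factories are registered per target, and the exported create/release symbols resolve to one of them at runtime. A simplified, self-contained sketch of that shape, with placeholder factories and a placeholder feature check; the actual KLEIDICV_MULTIVERSION_C_API expansion and the library's CPU detection differ:

#include <cstddef>

using CreateWorkspaceFn = void *(*)(std::size_t, std::size_t, std::size_t,
                                    std::size_t, std::size_t, std::size_t);

// Placeholder per-target factories (the real ones live in the
// workspace_factory_*.cpp files added by this patch).
static void *create_neon(std::size_t, std::size_t, std::size_t, std::size_t,
                         std::size_t, std::size_t) {
  return nullptr;
}
static void *create_sme2(std::size_t, std::size_t, std::size_t, std::size_t,
                         std::size_t, std::size_t) {
  return nullptr;
}

static bool cpu_has_sme2() { return false; }  // placeholder feature check

// Exported entry point: resolved once, then every call forwards to the
// selected implementation.
void *create_separable_filter_workspace(std::size_t w, std::size_t h,
                                        std::size_t kw, std::size_t kh,
                                        std::size_t ch, std::size_t interm) {
  static const CreateWorkspaceFn fn =
      cpu_has_sme2() ? &create_sme2 : &create_neon;
  return fn(w, h, kw, kh, ch, interm);
}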
+ // NOLINTBEGIN(bugprone-unused-raii) + auto ptr = SeparableFilterWorkspace::Pointer{ + reinterpret_cast(workspace)}; + // NOLINTEND(bugprone-unused-raii) +} + +} // namespace kleidicv::sme diff --git a/kleidicv/src/workspace/workspace_factory_sme2.cpp b/kleidicv/src/workspace/workspace_factory_sme2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad2094951af624471a2026c89c4c3d53eaa22f98 --- /dev/null +++ b/kleidicv/src/workspace/workspace_factory_sme2.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/filters/matmul_filter_checks.h" +#include "kleidicv/workspace/matmul.h" +#include "kleidicv/workspace/separable.h" +#include "kleidicv/workspace/workspace_factory.h" + +namespace kleidicv::sme2 { + +KLEIDICV_TARGET_FN_ATTRS void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t max_kernel_width, + size_t max_kernel_height, size_t max_channels, size_t intermediate_size) { + Rectangle max_rect(max_image_width, max_image_height); + Rectangle max_kernel(max_kernel_width, max_kernel_height); + SeparableFilterWorkspace::Pointer workspace; + if (gaussian_blur_sme2_implementation_checks(max_kernel_width, + max_kernel_height)) { + MatmulBufferSizesPolicy policy(max_rect, max_kernel, max_channels); + workspace = SeparableFilterWorkspace::create(max_rect, max_channels, + intermediate_size, policy); + } else { + workspace = SeparableFilterWorkspace::create(max_rect, max_channels, + intermediate_size); + } + + return workspace.release(); +} + +KLEIDICV_TARGET_FN_ATTRS void release_separable_filter_workspace( + void *workspace) { + // Deliberately create and immediately destroy a unique_ptr to delete the + // workspace. + // NOLINTBEGIN(bugprone-unused-raii) + auto ptr = SeparableFilterWorkspace::Pointer{ + reinterpret_cast(workspace)}; + // NOLINTEND(bugprone-unused-raii) +} + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/workspace/workspace_factory_sve2.cpp b/kleidicv/src/workspace/workspace_factory_sve2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a83c2a78945d684632819ce6cd0eae940924f43 --- /dev/null +++ b/kleidicv/src/workspace/workspace_factory_sve2.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/workspace/separable.h" +#include "kleidicv/workspace/workspace_factory.h" + +namespace kleidicv::sve2 { + +KLEIDICV_TARGET_FN_ATTRS void *create_separable_filter_workspace( + size_t max_image_width, size_t max_image_height, size_t, size_t, + size_t max_channels, size_t intermediate_size) { + Rectangle max_rect(max_image_width, max_image_height); + auto workspace = SeparableFilterWorkspace::create(max_rect, max_channels, + intermediate_size); + + return workspace.release(); +} + +KLEIDICV_TARGET_FN_ATTRS void release_separable_filter_workspace( + void *workspace) { + // Deliberately create and immediately destroy a unique_ptr to delete the + // workspace. 
+ // NOLINTBEGIN(bugprone-unused-raii) + auto ptr = SeparableFilterWorkspace::Pointer{ + reinterpret_cast(workspace)}; + // NOLINTEND(bugprone-unused-raii) +} + +} // namespace kleidicv::sve2 diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 4bc90b2cb82e4859d54d905ceeae7fcb0061c789..aa8dad7ca22bca8796068429137fc76ba0883a26 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -44,6 +44,8 @@ static constexpr std::array kAllBorders = { }; static constexpr size_t kToleranceOne = 1; +static constexpr size_t kToleranceTwo = 2; +static constexpr size_t kToleranceThree = 3; // Test for GaussianBlur operator. template ; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(0.01) .test_with_generated_mask(); } @@ -329,14 +333,14 @@ TYPED_TEST(GaussianBlur, 11x11_CustomSigma) { TYPED_TEST(GaussianBlur, 15x15_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(0.01) .test_with_generated_mask(); } @@ -346,14 +350,14 @@ TYPED_TEST(GaussianBlur, 21x21_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; // TODO kReplicateBorder is temporary until we implement all borders GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kAllBorders, kToleranceTwo} .with_sigma(0.01) .test_with_generated_mask(); } @@ -816,7 +820,7 @@ TYPED_TEST(GaussianBlur, ValidImageSize7x7) { expected.set(3, 0, {16, 16, 16, 17, 18, 19}); expected.set(4, 0, {26, 26, 25, 26, 27, 27}); expected.set(5, 0, {32, 31, 29, 29, 30, 30}); - EXPECT_EQ_ARRAY2D(expected, dst); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, expected, dst); EXPECT_EQ(KLEIDICV_OK, gaussian_blur()( src.data(), src.stride(), dst.data(), dst.stride(),