From 9c23bb356ef68d90ca53e1c8b933a19de7ad3523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 26 Apr 2024 16:09:27 +0200 Subject: [PATCH 1/9] Split separable filters into multiple files --- kleidicv/include/kleidicv/neon.h | 260 --------------- .../kleidicv/separable_filter_3x3_neon.h | 152 +++++++++ .../kleidicv/separable_filter_3x3_sc.h | 162 +++++++++ .../kleidicv/separable_filter_5x5_neon.h | 140 ++++++++ .../kleidicv/separable_filter_5x5_sc.h | 180 ++++++++++ kleidicv/include/kleidicv/sve2.h | 308 ------------------ kleidicv/src/filters/gaussian_blur_neon.cpp | 2 + kleidicv/src/filters/gaussian_blur_sc.h | 2 + kleidicv/src/filters/sobel_neon.cpp | 1 + kleidicv/src/filters/sobel_sc.h | 1 + 10 files changed, 640 insertions(+), 568 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_sc.h create mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_sc.h diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index 7339eb8ee..c5a59dc1a 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -325,266 +325,6 @@ void apply_block_operation_by_rows(OperationType &operation, zip_rows(block_operation, std::forward(args)...); } -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr Margin margin() { return Margin{1UL}; } - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c0())[index]; - auto src_1 = &src_rows.at(border_offsets.c1())[index]; - auto src_2 = &src_rows.at(border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - SourceVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.vertical_vector_path(src_a, &dst_rows[index]); - filter_.vertical_vector_path( - src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[3]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - BufferVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr Margin margin() { return Margin{2UL}; } - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[5]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_a[5], src_b[5]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - } // namespace kleidicv::neon #endif // KLEIDICV_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h new file mode 100644 index 000000000..ce507a4c5 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -0,0 +1,152 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H + +#include "kleidicv/neon.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{1UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(border_offsets.c0())[index]; + auto src_1 = &src_rows.at(border_offsets.c1())[index]; + auto src_2 = &src_rows.at(border_offsets.c2())[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + SourceVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.vertical_vector_path(src_a, &dst_rows[index]); + filter_.vertical_vector_path( + src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + SourceVectorType src[3]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[3]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + BufferVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[3]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h new file mode 100644 index 000000000..2d4e0fafb --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_SC_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H + +#include "kleidicv/sve2.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{1UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, + &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h new file mode 100644 index 000000000..9e0451b2f --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H + +#include "kleidicv/neon.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{2UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[5]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[5]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + + BufferVectorType src_a[5], src_b[5]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[5]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h new file mode 100644 index 000000000..c87f1c39b --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -0,0 +1,180 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5x5_SC_H +#define KLEIDICV_SEPARABLE_FILTER_5x5_SC_H + +#include "kleidicv/sve2.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{2UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5x5_SC_H diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sve2.h index bedd2cbd6..8f656d7d7 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -500,314 +500,6 @@ void apply_operation_by_rows(OperationType &operation, zip_rows(row_based_operation, std::forward(args)...); } -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{1UL}; - } - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, - &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { - return Margin{2UL}; - } - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - // Swap two variables, since some C++ Standard Library implementations do not // allow using std::swap for SVE vectors. template diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index 096231d35..49325278f 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -5,6 +5,8 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" +#include "kleidicv/separable_filter_5x5_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 491bd454e..362d385e1 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,6 +8,8 @@ #include #include "kleidicv/kleidicv.h" +#include "kleidicv/separable_filter_3x3_sc.h" +#include "kleidicv/separable_filter_5x5_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 51a419392..869e34788 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -6,6 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 5671232fb..6da923694 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -7,6 +7,7 @@ #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" +#include "kleidicv/separable_filter_3x3_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { -- GitLab From f7bfd8b2512a816f33fc6a7051c1464769b07a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 8 May 2024 09:39:47 +0200 Subject: [PATCH 2/9] Refactor the "buffer_type_size" field Renaming "buffer" to "intermediate" should make it clearer what exactly is meant. Additionally, clarify that the intermediate size must be large enough, as it is not guaranteed to always be double the input type. --- kleidicv/include/kleidicv/kleidicv.h | 11 +++++++---- kleidicv/include/kleidicv/workspace/separable.h | 11 ++++++----- kleidicv/src/filters/gaussian_blur_api.cpp | 9 +++++---- kleidicv/src/filters/gaussian_blur_neon.cpp | 2 +- kleidicv/src/filters/gaussian_blur_sc.h | 2 +- test/api/test_gaussian_blur.cpp | 2 +- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index cb6fe639a..a12a0d203 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1031,20 +1031,23 @@ KLEIDICV_API_DECLARATION(kleidicv_canny_u8, const uint8_t *src, /// Creates a filter context according to the parameters. /// -/// Before a gaussian_blur operation, this initialization is needed. +/// Before a Gaussian blur operation, this initialization is needed. /// After the operation is finished, the context needs to be released /// using @ref kleidicv_filter_release. /// /// @param context Pointer where to return the created context's address. /// @param channels Number of channels in the data. Must be not more than /// @ref KLEIDICV_MAXIMUM_CHANNEL_COUNT. -/// @param type_size Size of buffer element in bytes. It must be double the -/// size of the type the filter operation is executed on. +/// @param intermediate_size Size of an intermediate buffer element in bytes. +/// The element must be large enough to fit values of +/// the intermediate type used internally by the +/// Gaussian blur operation. /// @param image Image dimensions. Its size must not be more than /// @ref KLEIDICV_MAX_IMAGE_PIXELS. /// kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, - size_t channels, size_t type_size, + size_t channels, + size_t intermediate_size, kleidicv_rectangle_t image); /// Releases a filter context that was previously created using @ref diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 3cf1671b9..e019a9536 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -81,8 +81,9 @@ class SeparableFilterWorkspace final { // Creates a workspace on the heap. static Pointer create(Rectangle rect, size_t channels, - size_t buffer_type_size) KLEIDICV_STREAMING_COMPATIBLE { - size_t buffer_rows_width = buffer_type_size * rect.width(); + size_t intermediate_size) + KLEIDICV_STREAMING_COMPATIBLE { + size_t buffer_rows_width = intermediate_size * rect.width(); // Adding more elements because of SVE, where interleaving stores are // governed by one predicate. For example, if a predicate requires 7 uint8_t // elements and an algorithm performs widening to 16 bits, the resulting @@ -110,14 +111,14 @@ class SeparableFilterWorkspace final { workspace->buffer_rows_stride_ = buffer_rows_stride; workspace->image_size_ = rect; workspace->channels_ = channels; - workspace->buffer_type_size_ = buffer_type_size; + workspace->intermediate_size_ = intermediate_size; return workspace; } size_t channels() const { return channels_; } Rectangle image_size() const { return image_size_; } - size_t buffer_type_size() const { return buffer_type_size_; } + size_t intermediate_size() const { return intermediate_size_; } // Processes rows vertically first along the full width template @@ -204,7 +205,7 @@ class SeparableFilterWorkspace final { Rectangle image_size_; size_t channels_; - size_t buffer_type_size_; + size_t intermediate_size_; // Workspace area begins here. uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index eeb6d7f74..3006c47c7 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -13,12 +13,13 @@ using KLEIDICV_TARGET_NAMESPACE::Rectangle; using KLEIDICV_TARGET_NAMESPACE::SeparableFilterWorkspace; kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, - size_t channels, size_t type_size, + size_t channels, + size_t intermediate_size, kleidicv_rectangle_t image) { CHECK_POINTERS(context); CHECK_RECTANGLE_SIZE(image); - if (type_size > KLEIDICV_MAXIMUM_TYPE_SIZE) { + if (intermediate_size > KLEIDICV_MAXIMUM_TYPE_SIZE) { return KLEIDICV_ERROR_RANGE; } @@ -26,8 +27,8 @@ kleidicv_error_t kleidicv_filter_create(kleidicv_filter_context_t **context, return KLEIDICV_ERROR_RANGE; } - auto workspace = - SeparableFilterWorkspace::create(Rectangle{image}, channels, type_size); + auto workspace = SeparableFilterWorkspace::create(Rectangle{image}, channels, + intermediate_size); if (!workspace) { *context = nullptr; return KLEIDICV_ERROR_ALLOCATION; diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index 49325278f..accbca6ba 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -172,7 +172,7 @@ kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, auto *workspace = reinterpret_cast(context); - if (workspace->buffer_type_size() != 2 * sizeof(ScalarType)) { + if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 362d385e1..ff24cf738 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -166,7 +166,7 @@ kleidicv_error_t discrete_gaussian_blur( auto *workspace = reinterpret_cast(context); - if (workspace->buffer_type_size() != 2 * sizeof(ScalarType)) { + if (workspace->intermediate_size() != 2 * sizeof(ScalarType)) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 12ab7c7ab..f3743828a 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -108,7 +108,7 @@ class GaussianBlurTest : public test::KernelTest { kleidicv_filter_context_t *context = nullptr; auto ret = kleidicv_filter_create( - &context, input->channels(), sizeof(IntermediateType), + &context, input->channels(), 2 * sizeof(InputType), kleidicv_rectangle_t{input->width() / input->channels(), input->height()}); if (ret != KLEIDICV_OK) { -- GitLab From 00b8f453e054411b336e927b8559a2532fef5b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 26 Apr 2024 16:40:24 +0200 Subject: [PATCH 3/9] Add implementation for NEON 7x7 Gaussian blur --- adapters/opencv/kleidicv_hal.cpp | 2 + conformity/opencv/test_gaussian_blur.cpp | 20 +++ .../include/kleidicv/filters/gaussian_blur.h | 23 ++- kleidicv/include/kleidicv/kleidicv.h | 18 ++ .../kleidicv/separable_filter_7x7_neon.h | 154 ++++++++++++++++ kleidicv/include/kleidicv/workspace/borders.h | 164 ++++++++++++++++- kleidicv/src/filters/gaussian_blur_api.cpp | 5 + kleidicv/src/filters/gaussian_blur_neon.cpp | 166 +++++++++++++++++- kleidicv/src/filters/gaussian_blur_sme2.cpp | 9 +- kleidicv/src/filters/gaussian_blur_sve2.cpp | 10 +- test/api/test_gaussian_blur.cpp | 39 +++- 11 files changed, 593 insertions(+), 17 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_neon.h diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 380931a3c..67b769b19 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -268,6 +268,8 @@ int gaussian_blur_binomial(const uchar *src_data, size_t src_step, impl = kleidicv_gaussian_blur_3x3_u8; } else if ((kernel_size == 5) && (width >= 5) && (height >= 5)) { impl = kleidicv_gaussian_blur_5x5_u8; + } else if ((kernel_size == 7) && (width >= 7) && (height >= 7)) { + impl = kleidicv_gaussian_blur_7x7_u8; } else { return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index 87dd565dc..f56f4a933 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -82,6 +82,26 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 5x5, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 5x5, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 5x5, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), + + TEST("Gaussian blur 7x7, BORDER_REFLECT, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 2 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + TEST("Gaussian blur 7x7, BORDER_REFLECT, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), + + TEST("Gaussian blur 7x7, BORDER_WRAP, 1 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 2 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 3 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + TEST("Gaussian blur 7x7, BORDER_WRAP, 4 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), + + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index 9d460027a..769c5480f 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -26,6 +26,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace neon namespace sve2 { @@ -44,6 +51,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sve2 namespace sme2 { @@ -62,6 +76,13 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + } // namespace sme2 } // namespace kleidicv diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index a12a0d203..17f3298e6 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1074,6 +1074,16 @@ kleidicv_error_t kleidicv_filter_release(kleidicv_filter_context_t *context); /// [ 4, 16, 24, 16, 4 ] /// [ 1, 4, 6, 4, 1 ] /// ``` +/// 7x7 Gaussian Blur filter for uint8_t types: +/// ``` +/// [ 4, 14, 28, 36, 28, 14, 4 ] +/// [ 14, 49, 98, 126, 98, 49, 14 ] +/// [ 28, 98, 196, 252, 196, 98, 28 ] +/// 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] +/// [ 28, 98, 196, 252, 196, 98, 28 ] +/// [ 14, 49, 98, 126, 98, 49, 14 ] +/// [ 4, 14, 28, 36, 28, 14, 4 ] +/// ``` /// /// Width and height are the same for the source and for the destination. Number /// of elements is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. @@ -1122,6 +1132,14 @@ KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_5x5_u8, const uint8_t *src, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +/// @copydoc kleidicv_gaussian_blur_3x3_u8 +/// +KLEIDICV_API_DECLARATION(kleidicv_gaussian_blur_7x7_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); + /// Splits a multi channel source stream into separate 1-channel streams. Width /// and height are the same for the source stream and for all the destination /// streams. Number of pixels is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h new file mode 100644 index 000000000..9936396e6 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H + +#include "kleidicv/neon.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr Margin margin() { return Margin{3UL}; } + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[7]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[7]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + src[5] = src_rows.at(border_offsets.c5())[index]; + src[6] = src_rows.at(border_offsets.c6())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + + BufferVectorType src_a[7], src_b[7]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[7]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H diff --git a/kleidicv/include/kleidicv/workspace/borders.h b/kleidicv/include/kleidicv/workspace/borders.h index 5e5c18be2..028437c5f 100644 --- a/kleidicv/include/kleidicv/workspace/borders.h +++ b/kleidicv/include/kleidicv/workspace/borders.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -270,6 +270,164 @@ class FixedBorderInfo final { FixedBorderType border_type_; }; // end of class FixedBorderInfo +// Border offsets for 7x7 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) + : offsets_{o0, o1, o2, o3, o4, o5, o6} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + + private: + size_t offsets_[7]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-3, -2, -1, 0, 1, 2, 3); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(-1, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(2, 1, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(0, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); + } else { + return get(height_ - 3, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(3, 2, 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(1, 0, -1, 0, 1, 2, 3); + } else { + return get(-1, -2, -1, 0, 1, 2, 3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 1); + } else { + return get(-3, -2, -1, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 0); + } else { + return get(-3, -2, -1, 0, 0, -1, -2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 3 - height_); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); + } else { + return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 1); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 0, -1); + } else { + return get(-3, -2, -1, 0, -1, -2, -3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 2U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 3U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + // Shorthand for 3x3 filter border type. template using FixedBorderInfo3x3 = FixedBorderInfo; @@ -278,6 +436,10 @@ using FixedBorderInfo3x3 = FixedBorderInfo; template using FixedBorderInfo5x5 = FixedBorderInfo; +// Shorthand for 7x7 filter border type. +template +using FixedBorderInfo7x7 = FixedBorderInfo; + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_BORDERS_H diff --git a/kleidicv/src/filters/gaussian_blur_api.cpp b/kleidicv/src/filters/gaussian_blur_api.cpp index 3006c47c7..f42da7913 100644 --- a/kleidicv/src/filters/gaussian_blur_api.cpp +++ b/kleidicv/src/filters/gaussian_blur_api.cpp @@ -61,3 +61,8 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_gaussian_blur_5x5_u8, &kleidicv::neon::gaussian_blur_5x5_u8, KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_5x5_u8), &kleidicv::sme2::gaussian_blur_5x5_u8); + +KLEIDICV_MULTIVERSION_C_API( + kleidicv_gaussian_blur_7x7_u8, &kleidicv::neon::gaussian_blur_7x7_u8, + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::gaussian_blur_7x7_u8), + &kleidicv::sme2::gaussian_blur_7x7_u8); diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index accbca6ba..33ce02f7f 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -7,6 +7,7 @@ #include "kleidicv/neon.h" #include "kleidicv/separable_filter_3x3_neon.h" #include "kleidicv/separable_filter_5x5_neon.h" +#include "kleidicv/separable_filter_7x7_neon.h" namespace kleidicv::neon { @@ -81,7 +82,6 @@ class DiscreteGaussianBlur { // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] // [ 4, 16, 24, 16, 4 ] [ 4 ] // [ 1, 4, 6, 4, 1 ] [ 1 ] -// 5x5 Gaussian Blur filter for uint8_t types. template <> class DiscreteGaussianBlur { public: @@ -90,9 +90,9 @@ class DiscreteGaussianBlur { using DestinationType = uint8_t; DiscreteGaussianBlur() - : const_6_u8_{vmov_n_u8(6)}, - const_6_u16_{vmovq_n_u16(6)}, - const_4_u16_{vmovq_n_u16(4)} {} + : const_6_u8_half_{vdup_n_u8(6)}, + const_6_u16_{vdupq_n_u16(6)}, + const_4_u16_{vdupq_n_u16(4)} {} // Applies vertical filtering vector using SIMD operations. // @@ -102,8 +102,10 @@ class DiscreteGaussianBlur { uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4])); uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3])); uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3])); - uint16x8_t acc_l = vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_); - uint16x8_t acc_h = vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_); + uint16x8_t acc_l = + vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_); + uint16x8_t acc_h = + vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_); acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_); acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_); vst1q(&dst[0], acc_l); @@ -139,11 +141,149 @@ class DiscreteGaussianBlur { } private: - uint8x8_t const_6_u8_; + uint8x8_t const_6_u8_half_; uint16x8_t const_6_u16_; uint16x8_t const_4_u16_; }; // end of class DiscreteGaussianBlur +// Template for 7x7 Gaussian Blur approximation filters. +// +// [ 4, 14, 28, 36, 28, 14, 4 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 28, 98, 196, 252, 196, 98, 28 ] +// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] = +// [ 28, 98, 196, 252, 196, 98, 28 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 4, 14, 28, 36, 28, 14, 4 ] +// +// [ 2 ] +// [ 7 ] +// [ 14 ] +// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ] +// [ 14 ] +// [ 7 ] +// [ 2 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint16_t; + using DestinationType = uint8_t; + + DiscreteGaussianBlur() + : const_7_u16_{vdupq_n_u16(7)}, + const_7_u32_{vdupq_n_u32(7)}, + const_9_u16_{vdupq_n_u16(9)} {} + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_vector_path(uint8x16_t src[7], BufferType *dst) const { + uint16x8_t acc_0_6_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[6])); + uint16x8_t acc_0_6_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[6])); + + uint16x8_t acc_1_5_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[5])); + uint16x8_t acc_1_5_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[5])); + + uint16x8_t acc_2_4_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[4])); + uint16x8_t acc_2_4_h = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[4])); + + uint16x8_t acc_3_l = vmovl_u8(vget_low_u8(src[3])); + uint16x8_t acc_3_h = vmovl_u8(vget_high_u8(src[3])); + + uint16x8_t acc_0_2_4_6_l = vmlaq_u16(acc_0_6_l, acc_2_4_l, const_7_u16_); + uint16x8_t acc_0_2_4_6_h = vmlaq_u16(acc_0_6_h, acc_2_4_h, const_7_u16_); + + uint16x8_t acc_0_2_3_4_6_l = + vmlaq_u16(acc_0_2_4_6_l, acc_3_l, const_9_u16_); + uint16x8_t acc_0_2_3_4_6_h = + vmlaq_u16(acc_0_2_4_6_h, acc_3_h, const_9_u16_); + + acc_0_2_3_4_6_l = vshlq_n_u16(acc_0_2_3_4_6_l, 1); + acc_0_2_3_4_6_h = vshlq_n_u16(acc_0_2_3_4_6_h, 1); + + uint16x8_t acc_0_1_2_3_4_5_6_l = + vmlaq_u16(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u16_); + uint16x8_t acc_0_1_2_3_4_5_6_h = + vmlaq_u16(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u16_); + + vst1q(&dst[0], acc_0_1_2_3_4_5_6_l); + vst1q(&dst[8], acc_0_1_2_3_4_5_6_h); + } + + // Applies vertical filtering vector using scalar operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_scalar_path(const SourceType src[7], BufferType *dst) const { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = acc; + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_vector_path(uint16x8_t src[7], DestinationType *dst) const { + uint32x4_t acc_0_6_l = + vaddl_u16(vget_low_u16(src[0]), vget_low_u16(src[6])); + uint32x4_t acc_0_6_h = + vaddl_u16(vget_high_u16(src[0]), vget_high_u16(src[6])); + + uint32x4_t acc_1_5_l = + vaddl_u16(vget_low_u16(src[1]), vget_low_u16(src[5])); + uint32x4_t acc_1_5_h = + vaddl_u16(vget_high_u16(src[1]), vget_high_u16(src[5])); + + uint16x8_t acc_2_4 = vaddq_u16(src[2], src[4]); + + uint32x4_t acc_0_2_4_6_l = + vmlal_u16(acc_0_6_l, vget_low_u16(acc_2_4), vget_low_u16(const_7_u16_)); + uint32x4_t acc_0_2_4_6_h = vmlal_u16(acc_0_6_h, vget_high_u16(acc_2_4), + vget_high_u16(const_7_u16_)); + + uint32x4_t acc_0_2_3_4_6_l = vmlal_u16(acc_0_2_4_6_l, vget_low_u16(src[3]), + vget_low_u16(const_9_u16_)); + uint32x4_t acc_0_2_3_4_6_h = vmlal_u16(acc_0_2_4_6_h, vget_high_u16(src[3]), + vget_high_u16(const_9_u16_)); + + acc_0_2_3_4_6_l = vshlq_n_u32(acc_0_2_3_4_6_l, 1); + acc_0_2_3_4_6_h = vshlq_n_u32(acc_0_2_3_4_6_h, 1); + + uint32x4_t acc_0_1_2_3_4_5_6_l = + vmlaq_u32(acc_0_2_3_4_6_l, acc_1_5_l, const_7_u32_); + uint32x4_t acc_0_1_2_3_4_5_6_h = + vmlaq_u32(acc_0_2_3_4_6_h, acc_1_5_h, const_7_u32_); + + uint16x4_t acc_0_1_2_3_4_5_6_u16_l = vrshrn_n_u32(acc_0_1_2_3_4_5_6_l, 12); + uint16x4_t acc_0_1_2_3_4_5_6_u16_h = vrshrn_n_u32(acc_0_1_2_3_4_5_6_h, 12); + + uint16x8_t acc_0_1_2_3_4_5_6_u16 = + vcombine_u16(acc_0_1_2_3_4_5_6_u16_l, acc_0_1_2_3_4_5_6_u16_h); + uint8x8_t acc_0_1_2_3_4_5_6_u8 = vmovn_u16(acc_0_1_2_3_4_5_6_u16); + + vst1(&dst[0], acc_0_1_2_3_4_5_6_u8); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_scalar_path(const BufferType src[7], + DestinationType *dst) const { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = rounding_shift_right(acc, 12); + } + + private: + uint16x8_t const_7_u16_; + uint32x4_t const_7_u32_; + uint16x8_t const_9_u16_; +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur(const ScalarType *src, size_t src_stride, ScalarType *dst, @@ -221,4 +361,16 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); +} + } // namespace kleidicv::neon diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp index 9f692b3c0..1ad76b02e 100644 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sme2.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -27,4 +27,11 @@ gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, border_type, context); } +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +gaussian_blur_7x7_u8(const uint8_t *, size_t, uint8_t *, size_t, size_t, size_t, + size_t, kleidicv_border_type_t, + kleidicv_filter_context_t *) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; +} + } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_sve2.cpp index 12c7176de..10d4c9506 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sve2.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -31,4 +31,12 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *, size_t, uint8_t *, + size_t, size_t, size_t, size_t, + kleidicv_border_type_t, + kleidicv_filter_context_t *) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; +} + } // namespace kleidicv::sve2 diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index f3743828a..8166e4703 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -17,6 +17,7 @@ KLEIDICV_GAUSSIAN_BLUR(uint8_t, 3x3, u8); KLEIDICV_GAUSSIAN_BLUR(uint8_t, 5x5, u8); +KLEIDICV_GAUSSIAN_BLUR(uint8_t, 7x7, u8); // Implements KernelTestParams for Gaussian Blur operators. template @@ -25,7 +26,7 @@ struct GaussianBlurKernelTestParams; template struct GaussianBlurKernelTestParams { using InputType = uint8_t; - using IntermediateType = uint16_t; + using IntermediateType = uint32_t; using OutputType = uint8_t; static constexpr size_t kKernelSize = KernelSize; @@ -102,9 +103,12 @@ class GaussianBlurTest : public test::KernelTest { test::Array2D *output, kleidicv_border_type_t border_type, kleidicv_border_values_t) override { - auto api = KernelTestParams::kKernelSize == 3 - ? gaussian_blur_3x3() - : gaussian_blur_5x5(); + // NOLINTBEGIN(readability-avoid-nested-conditional-operator) + auto api = + KernelTestParams::kKernelSize == 3 ? gaussian_blur_3x3() + : KernelTestParams::kKernelSize == 5 ? gaussian_blur_5x5() + : gaussian_blur_7x7(); + // NOLINTEND(readability-avoid-nested-conditional-operator) kleidicv_filter_context_t *context = nullptr; auto ret = kleidicv_filter_create( @@ -129,9 +133,13 @@ class GaussianBlurTest : public test::KernelTest { // Apply rounding to nearest integer division. IntermediateType scale_result(const test::Kernel &kernel, IntermediateType result) override { - return kernel.width() == 3 ? ((result + 8) / 16) : ((result + 128) / 256); + // NOLINTBEGIN(readability-avoid-nested-conditional-operator) + return kernel.width() == 3 ? ((result + 8) / 16) + : kernel.width() == 5 ? ((result + 128) / 256) + : ((result + 2048) / 4096); + // NOLINTEND(readability-avoid-nested-conditional-operator) } -}; // end of class class GaussianBlur3x3Test +}; // end of class GaussianBlurTest using ElementTypes = ::testing::Types; @@ -190,6 +198,25 @@ TYPED_TEST(GaussianBlur, 5x5) { .test(mask); } +// Tests gaussian_blur_7x7_ API. +TYPED_TEST(GaussianBlur, 7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + // 7x7 GaussianBlur operator. + test::Array2D mask{7, 7}; + // clang-format off + mask.set(0, 0, { 4, 14, 28, 36, 28, 14, 4 }); + mask.set(1, 0, { 14, 49, 98, 126, 98, 49, 14 }); + mask.set(2, 0, { 28, 98, 196, 252, 196, 98, 28 }); + mask.set(3, 0, { 36, 126, 252, 324, 252, 126, 36 }); + mask.set(4, 0, { 28, 98, 196, 252, 196, 98, 28 }); + mask.set(5, 0, { 14, 49, 98, 126, 98, 49, 14 }); + mask.set(6, 0, { 4, 14, 28, 36, 28, 14, 4 }); + // clang-format on + GaussianBlurTest{} + .with_border_types(make_generator_ptr(kAllBorders)) + .test(mask); +} + TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; -- GitLab From 938429b1a82006b8e9c32d77dca6c8a5a8bdcf87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 8 May 2024 14:54:31 +0200 Subject: [PATCH 4/9] Add implementation for SVE2/SME2 7x7 Gaussian blur --- .../kleidicv/separable_filter_7x7_sc.h | 196 ++++++++++++++++++ kleidicv/src/filters/gaussian_blur_sc.h | 124 +++++++++++ kleidicv/src/filters/gaussian_blur_sme2.cpp | 11 +- kleidicv/src/filters/gaussian_blur_sve2.cpp | 14 +- 4 files changed, 336 insertions(+), 9 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_sc.h diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h new file mode 100644 index 000000000..3ab65bed6 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7x7_SC_H +#define KLEIDICV_SEPARABLE_FILTER_7x7_SC_H + +#include "kleidicv/sve2.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr Margin margin() KLEIDICV_STREAMING_COMPATIBLE { + return Margin{3UL}; + } + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c0())[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c1())[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c2())[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c3())[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c4())[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c5())[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c6())[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7x7_SC_H diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index ff24cf738..5f5127652 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -10,6 +10,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/separable_filter_3x3_sc.h" #include "kleidicv/separable_filter_5x5_sc.h" +#include "kleidicv/separable_filter_7x7_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -139,6 +140,129 @@ class DiscreteGaussianBlur { } }; // end of class DiscreteGaussianBlur +// Template for 7x7 Gaussian Blur approximation filters. +// +// [ 4, 14, 28, 36, 28, 14, 4 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 28, 98, 196, 252, 196, 98, 28 ] +// F = 1/4096 * [ 36, 126, 252, 324, 252, 126, 36 ] = +// [ 28, 98, 196, 252, 196, 98, 28 ] +// [ 14, 49, 98, 126, 98, 49, 14 ] +// [ 4, 14, 28, 36, 28, 14, 4 ] +// +// [ 2 ] +// [ 7 ] +// [ 14 ] +// = 1/4096 * [ 18 ] * [ 2, 7, 14, 18, 14, 7, 2 ] +// [ 14 ] +// [ 7 ] +// [ 2 ] +template <> +class DiscreteGaussianBlur { + public: + using SourceType = uint8_t; + using BufferType = uint16_t; + using DestinationType = uint8_t; + + // Applies vertical filtering vector using SIMD operations. + // + // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void vertical_vector_path( + svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, + svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t const_7_u16 = svdup_n_u16(7); + svuint16_t const_9_u16 = svdup_n_u16(9); + + svuint16_t acc_0_6_b = svaddlb_u16(src_0, src_6); + svuint16_t acc_0_6_t = svaddlt_u16(src_0, src_6); + + svuint16_t acc_1_5_b = svaddlb_u16(src_1, src_5); + svuint16_t acc_1_5_t = svaddlt_u16(src_1, src_5); + + svuint16_t acc_2_4_b = svaddlb_u16(src_2, src_4); + svuint16_t acc_2_4_t = svaddlt_u16(src_2, src_4); + + svuint16_t acc_3_b = svmovlb_u16(src_3); + svuint16_t acc_3_t = svmovlt_u16(src_3); + + svuint16_t acc_0_2_4_6_b = + svmla_u16_x(pg, acc_0_6_b, acc_2_4_b, const_7_u16); + svuint16_t acc_0_2_4_6_t = + svmla_u16_x(pg, acc_0_6_t, acc_2_4_t, const_7_u16); + + svuint16_t acc_0_2_3_4_6_b = + svmla_u16_x(pg, acc_0_2_4_6_b, acc_3_b, const_9_u16); + svuint16_t acc_0_2_3_4_6_t = + svmla_u16_x(pg, acc_0_2_4_6_t, acc_3_t, const_9_u16); + acc_0_2_3_4_6_b = svlsl_n_u16_x(pg, acc_0_2_3_4_6_b, 1); + acc_0_2_3_4_6_t = svlsl_n_u16_x(pg, acc_0_2_3_4_6_t, 1); + + svuint16_t acc_0_1_2_3_4_5_6_b = + svmla_u16_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, const_7_u16); + svuint16_t acc_0_1_2_3_4_5_6_t = + svmla_u16_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, const_7_u16); + + svuint16x2_t interleaved = + svcreate2(acc_0_1_2_3_4_5_6_b, acc_0_1_2_3_4_5_6_t); + svst2(pg, &dst[0], interleaved); + } + + // Applies horizontal filtering vector using SIMD operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_vector_path( + svbool_t pg, svuint16_t src_0, svuint16_t src_1, svuint16_t src_2, + svuint16_t src_3, svuint16_t src_4, svuint16_t src_5, svuint16_t src_6, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t const_7_u16 = svdup_n_u16(7); + svuint16_t const_9_u16 = svdup_n_u16(9); + svuint32_t const_7_u32 = svdup_n_u32(7); + + svuint32_t acc_0_6_b = svaddlb_u32(src_0, src_6); + svuint32_t acc_0_6_t = svaddlt_u32(src_0, src_6); + + svuint32_t acc_1_5_b = svaddlb_u32(src_1, src_5); + svuint32_t acc_1_5_t = svaddlt_u32(src_1, src_5); + + svuint16_t acc_2_4 = svadd_u16_x(pg, src_2, src_4); + + svuint32_t acc_0_2_4_6_b = svmlalb_u32(acc_0_6_b, acc_2_4, const_7_u16); + svuint32_t acc_0_2_4_6_t = svmlalt_u32(acc_0_6_t, acc_2_4, const_7_u16); + + svuint32_t acc_0_2_3_4_6_b = svmlalb_u32(acc_0_2_4_6_b, src_3, const_9_u16); + svuint32_t acc_0_2_3_4_6_t = svmlalt_u32(acc_0_2_4_6_t, src_3, const_9_u16); + + acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1); + acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1); + + svuint32_t acc_0_1_2_3_4_5_6_b = + svmla_u32_x(pg, acc_0_2_3_4_6_b, acc_1_5_b, const_7_u32); + svuint32_t acc_0_1_2_3_4_5_6_t = + svmla_u32_x(pg, acc_0_2_3_4_6_t, acc_1_5_t, const_7_u32); + + svuint16_t acc_0_1_2_3_4_5_6_u16_b = + svrshrnb_n_u32(acc_0_1_2_3_4_5_6_b, 12); + svuint16_t acc_0_1_2_3_4_5_6_u16 = + svrshrnt_n_u32(acc_0_1_2_3_4_5_6_u16_b, acc_0_1_2_3_4_5_6_t, 12); + + svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16); + } + + // Applies horizontal filtering vector using scalar operations. + // + // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * + // * [ 2, 7, 14, 18, 14, 7, 2 ]T + void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) + const KLEIDICV_STREAMING_COMPATIBLE { + uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + + src[4] * 14 + src[5] * 7 + src[6] * 2; + dst[0] = rounding_shift_right(acc, 12); + } +}; // end of class DiscreteGaussianBlur + template kleidicv_error_t discrete_gaussian_blur( const ScalarType *src, size_t src_stride, ScalarType *dst, diff --git a/kleidicv/src/filters/gaussian_blur_sme2.cpp b/kleidicv/src/filters/gaussian_blur_sme2.cpp index 1ad76b02e..33a2dd09d 100644 --- a/kleidicv/src/filters/gaussian_blur_sme2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sme2.cpp @@ -28,10 +28,13 @@ gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, } KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t -gaussian_blur_7x7_u8(const uint8_t *, size_t, uint8_t *, size_t, size_t, size_t, - size_t, kleidicv_border_type_t, - kleidicv_filter_context_t *) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; +gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t channels, kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); } } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/gaussian_blur_sve2.cpp b/kleidicv/src/filters/gaussian_blur_sve2.cpp index 10d4c9506..7ae808d33 100644 --- a/kleidicv/src/filters/gaussian_blur_sve2.cpp +++ b/kleidicv/src/filters/gaussian_blur_sve2.cpp @@ -32,11 +32,15 @@ kleidicv_error_t gaussian_blur_5x5_u8(const uint8_t *src, size_t src_stride, } KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *, size_t, uint8_t *, - size_t, size_t, size_t, size_t, - kleidicv_border_type_t, - kleidicv_filter_context_t *) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; +kleidicv_error_t gaussian_blur_7x7_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + size_t channels, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return discrete_gaussian_blur(src, src_stride, dst, dst_stride, + width, height, channels, + border_type, context); } } // namespace kleidicv::sve2 -- GitLab From 834e272a6f63efb9299e575b4897bba4a4fae9f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 15 May 2024 09:53:37 +0200 Subject: [PATCH 5/9] Split border definitions into multiple files --- .../kleidicv/separable_filter_3x3_neon.h | 1 + .../kleidicv/separable_filter_3x3_sc.h | 1 + .../kleidicv/separable_filter_5x5_neon.h | 1 + .../kleidicv/separable_filter_5x5_sc.h | 1 + .../kleidicv/separable_filter_7x7_neon.h | 1 + .../kleidicv/separable_filter_7x7_sc.h | 1 + .../include/kleidicv/workspace/border_3x3.h | 116 +++++ .../include/kleidicv/workspace/border_5x5.h | 162 +++++++ .../include/kleidicv/workspace/border_7x7.h | 181 +++++++ .../include/kleidicv/workspace/border_types.h | 39 ++ kleidicv/include/kleidicv/workspace/borders.h | 445 ------------------ .../include/kleidicv/workspace/separable.h | 2 +- 12 files changed, 505 insertions(+), 446 deletions(-) create mode 100644 kleidicv/include/kleidicv/workspace/border_3x3.h create mode 100644 kleidicv/include/kleidicv/workspace/border_5x5.h create mode 100644 kleidicv/include/kleidicv/workspace/border_7x7.h create mode 100644 kleidicv/include/kleidicv/workspace/border_types.h delete mode 100644 kleidicv/include/kleidicv/workspace/borders.h diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h index ce507a4c5..ec03c40ea 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H #include "kleidicv/neon.h" +#include "kleidicv/workspace/border_3x3.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h index 2d4e0fafb..42c178b02 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H #include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_3x3.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h index 9e0451b2f..2694bc3bd 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H #include "kleidicv/neon.h" +#include "kleidicv/workspace/border_5x5.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h index c87f1c39b..2115c1ed0 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5x5_SC_H #include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_5x5.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h index 9936396e6..0b71b237a 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H #include "kleidicv/neon.h" +#include "kleidicv/workspace/border_7x7.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h index 3ab65bed6..a19e01e3a 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -6,6 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7x7_SC_H #include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_7x7.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h new file mode 100644 index 000000000..ecd5627d3 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_3x3.h @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_3X3_H +#define KLEIDICV_WORKSPACE_BORDER_3X3_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 3x3 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + Offsets() = default; + + Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + + private: + size_t offsets_[3]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const { return get(-1, 0, 1); } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(0, 0, 1); + break; + + case FixedBorderType::WRAP: + return get(height_ - 1, 0, 1); + break; + + case FixedBorderType::REVERSE: + return get(1, 0, 1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(-1, 0, 0); + break; + + case FixedBorderType::WRAP: + return get(-1, 0, 1 - height_); + break; + + case FixedBorderType::REVERSE: + return get(-1, 0, -1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index == 0U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index == (height_ - 1U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2) const { + return Offsets{o0, o1, o2}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 3x3 filter border type. +template +using FixedBorderInfo3x3 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_3X3_H diff --git a/kleidicv/include/kleidicv/workspace/border_5x5.h b/kleidicv/include/kleidicv/workspace/border_5x5.h new file mode 100644 index 000000000..06c2683bd --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_5x5.h @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_5X5_H +#define KLEIDICV_WORKSPACE_BORDER_5X5_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 5x5 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4) + : offsets_{o0, o1, o2, o3, o4} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + + private: + size_t offsets_[5]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-2, -1, 0, 1, 2); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 1, 2); + } else { + return get(-1, -1, 0, 1, 2); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(1, 0, 0, 1, 2); + } else { + return get(-1, -1, 0, 1, 2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 2, height_ - 1, 0, 1, 2); + } else { + return get(height_ - 2, -1, 0, 1, 2); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(2, 1, 0, 1, 2); + } else { + return get(0, -1, 0, 1, 2); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 1); + } else { + return get(-2, -1, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 1); + } else { + return get(-2, -1, 0, 0, -1); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 2 - height_); + } else { + return get(-2, -1, 0, 1 - height_, 2 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 0); + } else { + return get(-2, -1, 0, -1, -2); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 1U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 2U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, + size_t o4) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 5x5 filter border type. +template +using FixedBorderInfo5x5 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_5X5_H diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h new file mode 100644 index 000000000..75bb86117 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_7x7.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_7X7_H +#define KLEIDICV_WORKSPACE_BORDER_7X7_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 7x7 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) + : offsets_{o0, o1, o2, o3, o4, o5, o6} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + + private: + size_t offsets_[7]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-3, -2, -1, 0, 1, 2, 3); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(-1, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(2, 1, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(0, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); + } else { + return get(height_ - 3, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(3, 2, 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(1, 0, -1, 0, 1, 2, 3); + } else { + return get(-1, -2, -1, 0, 1, 2, 3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 1); + } else { + return get(-3, -2, -1, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 0); + } else { + return get(-3, -2, -1, 0, 0, -1, -2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 3 - height_); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); + } else { + return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 1); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 0, -1); + } else { + return get(-3, -2, -1, 0, -1, -2, -3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 2U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 3U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 7x7 filter border type. +template +using FixedBorderInfo7x7 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_7X7_H diff --git a/kleidicv/include/kleidicv/workspace/border_types.h b/kleidicv/include/kleidicv/workspace/border_types.h new file mode 100644 index 000000000..0825bc372 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_types.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_TYPES_H +#define KLEIDICV_WORKSPACE_BORDER_TYPES_H + +#include + +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +enum class FixedBorderType { + REPLICATE, + REFLECT, + WRAP, + REVERSE, +}; + +inline std::optional get_fixed_border_type( + kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type) { + case KLEIDICV_BORDER_TYPE_REPLICATE: + return FixedBorderType::REPLICATE; + case KLEIDICV_BORDER_TYPE_REFLECT: + return FixedBorderType::REFLECT; + case KLEIDICV_BORDER_TYPE_WRAP: + return FixedBorderType::WRAP; + case KLEIDICV_BORDER_TYPE_REVERSE: + return FixedBorderType::REVERSE; + default: + return std::optional(); + } +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_TYPES_H diff --git a/kleidicv/include/kleidicv/workspace/borders.h b/kleidicv/include/kleidicv/workspace/borders.h deleted file mode 100644 index 028437c5f..000000000 --- a/kleidicv/include/kleidicv/workspace/borders.h +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDERS_H -#define KLEIDICV_WORKSPACE_BORDERS_H - -#include - -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -enum class FixedBorderType { - REPLICATE, - REFLECT, - WRAP, - REVERSE, -}; - -inline std::optional get_fixed_border_type( - kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type) { - case KLEIDICV_BORDER_TYPE_REPLICATE: - return FixedBorderType::REPLICATE; - case KLEIDICV_BORDER_TYPE_REFLECT: - return FixedBorderType::REFLECT; - case KLEIDICV_BORDER_TYPE_WRAP: - return FixedBorderType::WRAP; - case KLEIDICV_BORDER_TYPE_REVERSE: - return FixedBorderType::REVERSE; - default: - return std::optional(); - } -} - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -// Border offsets for 3x3 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - Offsets() = default; - - Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - - private: - size_t offsets_[3]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const { return get(-1, 0, 1); } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(0, 0, 1); - break; - - case FixedBorderType::WRAP: - return get(height_ - 1, 0, 1); - break; - - case FixedBorderType::REVERSE: - return get(1, 0, 1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(-1, 0, 0); - break; - - case FixedBorderType::WRAP: - return get(-1, 0, 1 - height_); - break; - - case FixedBorderType::REVERSE: - return get(-1, 0, -1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index == 0U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index == (height_ - 1U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2) const { - return Offsets{o0, o1, o2}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Border offsets for 5x5 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - // NOLINTBEGIN(hicpp-member-init) - Offsets() = default; - // NOLINTEND(hicpp-member-init) - - Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4) - : offsets_{o0, o1, o2, o3, o4} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - - private: - size_t offsets_[5]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - return get(-2, -1, 0, 1, 2); - } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == 0) { - return get(0, 0, 0, 1, 2); - } else { - return get(-1, -1, 0, 1, 2); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == 0) { - return get(1, 0, 0, 1, 2); - } else { - return get(-1, -1, 0, 1, 2); - } - break; - - case FixedBorderType::WRAP: - if (column_index == 0) { - return get(height_ - 2, height_ - 1, 0, 1, 2); - } else { - return get(height_ - 2, -1, 0, 1, 2); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == 0) { - return get(2, 1, 0, 1, 2); - } else { - return get(0, -1, 0, 1, 2); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 1); - } else { - return get(-2, -1, 0, 0, 0); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 1); - } else { - return get(-2, -1, 0, 0, -1); - } - break; - - case FixedBorderType::WRAP: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 2 - height_); - } else { - return get(-2, -1, 0, 1 - height_, 2 - height_); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 0); - } else { - return get(-2, -1, 0, -1, -2); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index <= 1U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index >= (height_ - 2U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, - size_t o4) const KLEIDICV_STREAMING_COMPATIBLE { - return Offsets{o0, o1, o2, o3, o4}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Border offsets for 7x7 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - // NOLINTBEGIN(hicpp-member-init) - Offsets() = default; - // NOLINTEND(hicpp-member-init) - - Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6) - : offsets_{o0, o1, o2, o3, o4, o5, o6} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - size_t c5() const { return offsets_[5]; } - size_t c6() const { return offsets_[6]; } - - private: - size_t offsets_[7]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - return get(-3, -2, -1, 0, 1, 2, 3); - } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == 0) { - return get(0, 0, 0, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(-1, -1, -1, 0, 1, 2, 3); - } else { - return get(-2, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == 0) { - return get(2, 1, 0, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(0, -1, -1, 0, 1, 2, 3); - } else { - return get(-2, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::WRAP: - if (column_index == 0) { - return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); - } else { - return get(height_ - 3, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == 0) { - return get(3, 2, 1, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(1, 0, -1, 0, 1, 2, 3); - } else { - return get(-1, -2, -1, 0, 1, 2, 3); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 2); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 1, 1); - } else { - return get(-3, -2, -1, 0, 0, 0, 0); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 2); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 1, 0); - } else { - return get(-3, -2, -1, 0, 0, -1, -2); - } - break; - - case FixedBorderType::WRAP: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 3 - height_); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); - } else { - return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 1); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 0, -1); - } else { - return get(-3, -2, -1, 0, -1, -2, -3); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index <= 2U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index >= (height_ - 3U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { - return Offsets{o0, o1, o2, o3, o4, o5, o6}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Shorthand for 3x3 filter border type. -template -using FixedBorderInfo3x3 = FixedBorderInfo; - -// Shorthand for 5x5 filter border type. -template -using FixedBorderInfo5x5 = FixedBorderInfo; - -// Shorthand for 7x7 filter border type. -template -using FixedBorderInfo7x7 = FixedBorderInfo; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDERS_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index e019a9536..b8d036a07 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -8,7 +8,7 @@ #include #include -#include "borders.h" +#include "border_types.h" #include "kleidicv/kleidicv.h" #include "kleidicv/types.h" -- GitLab From 8a27e6f3450ac65339913dcd97432b910f74bb79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 16 May 2024 14:26:48 +0200 Subject: [PATCH 6/9] Add missing tests for the 7x7 kernel --- test/api/test_gaussian_blur.cpp | 175 ++++++++++++++++++++++++++++++-- 1 file changed, 164 insertions(+), 11 deletions(-) diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 8166e4703..da9040dbb 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -259,9 +259,31 @@ TYPED_TEST(GaussianBlur, UnsupportedBorderType5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, UnsupportedBorderType7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + TypeParam src[1] = {}, dst[1]; + for (kleidicv_border_type_t border : { + KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_TRANSPARENT, + KLEIDICV_BORDER_TYPE_NONE, + }) { + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()(src, sizeof(TypeParam), dst, + sizeof(TypeParam), validSize, + validSize, 1, border, context)); + } + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, NullPointer) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -275,6 +297,10 @@ TYPED_TEST(GaussianBlur, NullPointer) { test::test_null_args(gaussian_blur_5x5(), src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context); + validSize = KernelTestParams7x7::kKernelSize - 1; + test::test_null_args(gaussian_blur_7x7(), src, sizeof(TypeParam), + dst, sizeof(TypeParam), validSize, validSize, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, context); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -285,6 +311,7 @@ TYPED_TEST(GaussianBlur, Misalignment) { } using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; ASSERT_EQ(KLEIDICV_OK, @@ -308,6 +335,15 @@ TYPED_TEST(GaussianBlur, Misalignment) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_7x7()( + src, sizeof(TypeParam) + 1, dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -355,6 +391,28 @@ TYPED_TEST(GaussianBlur, ZeroImageSize5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ZeroImageSize7x7) { + TypeParam src[1] = {}, dst[1]; + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{0, 1})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 0, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{1, 0})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), 1, 0, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, ValidImageSize3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; @@ -399,16 +457,41 @@ TYPED_TEST(GaussianBlur, ValidImageSize5x5) { EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } +TYPED_TEST(GaussianBlur, ValidImageSize7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t validSize = KernelTestParams::kKernelSize - 1; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validSize, validSize})); + test::Array2D src{validSize, validSize, + test::Options::vector_length()}; + src.set(0, 0, {1, 2, 3, 4, 5, 6}); + src.set(1, 0, {1, 2, 3, 4, 5, 6}); + src.set(2, 0, {1, 2, 3, 4, 5, 6}); + src.set(3, 0, {1, 2, 3, 4, 5, 6}); + src.set(4, 0, {1, 2, 3, 4, 5, 6}); + src.set(5, 0, {1, 2, 3, 4, 5, 6}); + + test::Array2D dst{validSize, validSize, + test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, + gaussian_blur_7x7()( + src.data(), src.stride(), dst.data(), dst.stride(), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REVERSE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + TYPED_TEST(GaussianBlur, UndersizeImage3x3) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t underSize = KernelTestParams::kKernelSize - 2; size_t validWidth = KernelTestParams::kKernelSize + 10; size_t validHeight = KernelTestParams::kKernelSize + 5; - TypeParam src[1], dst[1]; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), kleidicv_rectangle_t{underSize, underSize})); + TypeParam src[1] = {}, dst[1]; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, gaussian_blur_3x3()( src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, @@ -436,9 +519,8 @@ TYPED_TEST(GaussianBlur, UndersizeImage5x5) { using KernelTestParams = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t underSize = KernelTestParams::kKernelSize - 2; - size_t width = KernelTestParams::kKernelSize + 8; - size_t height = KernelTestParams::kKernelSize + 3; - + size_t validWidth = KernelTestParams::kKernelSize + 8; + size_t validHeight = KernelTestParams::kKernelSize + 3; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), kleidicv_rectangle_t{underSize, underSize})); @@ -448,20 +530,53 @@ TYPED_TEST(GaussianBlur, UndersizeImage5x5) { src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), - kleidicv_rectangle_t{underSize, height})); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, validHeight})); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, - height, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + validHeight, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validWidth, underSize})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_5x5()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); +} + +TYPED_TEST(GaussianBlur, UndersizeImage7x7) { + using KernelTestParams = GaussianBlurKernelTestParams; + kleidicv_filter_context_t *context = nullptr; + size_t underSize = KernelTestParams::kKernelSize - 2; + size_t validWidth = KernelTestParams::kKernelSize + 6; + size_t validHeight = KernelTestParams::kKernelSize + 1; ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create(&context, 1, 2 * sizeof(TypeParam), - kleidicv_rectangle_t{width, underSize})); + kleidicv_rectangle_t{underSize, underSize})); + TypeParam src[1] = {}, dst[1]; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - gaussian_blur_5x5()( - src, sizeof(TypeParam), dst, sizeof(TypeParam), width, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{underSize, validHeight})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), underSize, + validHeight, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); + ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_create( + &context, 1, 2 * sizeof(TypeParam), + kleidicv_rectangle_t{validWidth, underSize})); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validWidth, underSize, 1, KLEIDICV_BORDER_TYPE_REPLICATE, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } @@ -492,12 +607,23 @@ TYPED_TEST(GaussianBlur, OversizeImage) { src, sizeof(TypeParam), dst, sizeof(TypeParam), KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, ChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -517,12 +643,20 @@ TYPED_TEST(GaussianBlur, ChannelNumber) { src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, KLEIDICV_MAXIMUM_CHANNEL_COUNT + 1, + KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextSizeType) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -539,12 +673,18 @@ TYPED_TEST(GaussianBlur, InvalidContextSizeType) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -562,12 +702,19 @@ TYPED_TEST(GaussianBlur, InvalidContextChannelNumber) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize, + validSize, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } TYPED_TEST(GaussianBlur, InvalidContextImageSize) { using KernelTestParams3x3 = GaussianBlurKernelTestParams; using KernelTestParams5x5 = GaussianBlurKernelTestParams; + using KernelTestParams7x7 = GaussianBlurKernelTestParams; kleidicv_filter_context_t *context = nullptr; size_t validSize = KernelTestParams3x3::kKernelSize - 1; @@ -585,6 +732,12 @@ TYPED_TEST(GaussianBlur, InvalidContextImageSize) { gaussian_blur_5x5()( src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); + + validSize = KernelTestParams7x7::kKernelSize - 1; + EXPECT_EQ(KLEIDICV_ERROR_CONTEXT_MISMATCH, + gaussian_blur_7x7()( + src, sizeof(TypeParam), dst, sizeof(TypeParam), validSize + 1, + validSize + 1, 1, KLEIDICV_BORDER_TYPE_REFLECT, context)); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_release(context)); } -- GitLab From a33e51294dafe3d303c466daf0f8b52a640b279d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 16 May 2024 16:05:45 +0200 Subject: [PATCH 7/9] Add benchmarks for 7x7 Gaussian blur --- benchmark/benchmark.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index b66e4d7be..8da446649 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -131,3 +131,44 @@ static void resize_linear_4x4_f32(benchmark::State& state) { resize_linear(kleidicv_resize_linear_f32, 4, 4, state); } BENCHMARK(resize_linear_4x4_f32); + +template +static void gaussian_blur(Function f, size_t channels, + benchmark::State& state) { + // Setup + std::vector src, dst; + src.resize(image_width * image_height * channels); + dst.resize(image_width * image_height * channels); + + std::mt19937 generator; + std::generate(src.begin(), src.end(), generator); + + kleidicv_filter_context_t* context; + kleidicv_error_t err = + kleidicv_filter_create(&context, channels, 2 * sizeof(T), + kleidicv_rectangle_t{image_width, image_height}); + if (err != KLEIDICV_OK) { + state.SkipWithError("Could not initialize Gaussian blur filter."); + return; + } + + for (auto _ : state) { + // This code gets benchmarked + auto unused = + f(src.data(), image_width, dst.data(), image_width, image_width, + image_height, channels, KLEIDICV_BORDER_TYPE_REFLECT, context); + (void)unused; + } + + (void)kleidicv_filter_release(context); +} + +static void gaussian_blur_7x7_u8_1ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 1, state); +} +BENCHMARK(gaussian_blur_7x7_u8_1ch); + +static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) { + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 3, state); +} +BENCHMARK(gaussian_blur_7x7_u8_3ch); -- GitLab From 45f552c9806407d85496efe0fb5ff1c6dfa1f0fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 16 May 2024 16:47:53 +0200 Subject: [PATCH 8/9] Update doc/opencv.md Add mention of the new 7x7 kernel size for Gaussian blur. --- doc/opencv.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opencv.md b/doc/opencv.md index 99e287c56..aa687ec16 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -89,7 +89,7 @@ Currently does not support non-zero margins. Kernel shape is restricted to squar Notes on parameters: * `depth` - only supports `CV_8U` depth. * `width`,`height` - Image width and height should be greater than or equal to the size of the kernel in the given direction. -* `ksize_width == ksize_height` - kernel size. Only 3x3 and 5x5 kernels are supported. +* `ksize_width == ksize_height` - kernel size. Only 3x3, 5x5 and 7x7 kernels are supported. * `border_type` - pixel extrapolation method. Supported [OpenCV border types](https://docs.opencv.org/5.x/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5) are: + `cv::BORDER_REPLICATE` -- GitLab From e60576a30638fd2dd511a3c863b74c169bd5b5e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Tue, 28 May 2024 12:36:02 +0200 Subject: [PATCH 9/9] Update CHANGELOG.md and doc/functionality.md --- CHANGELOG.md | 2 ++ doc/functionality.md | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c04ee50f..240a62e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ This changelog aims to follow the guiding principles of ### Added - Exponential function for float. +- Gaussian Blur for 7x7 kernels. + ### Fixed ### Changed diff --git a/doc/functionality.md b/doc/functionality.md index 0edc0259a..874f5442d 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -57,12 +57,12 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Transpose | x | x | x | x | ## Image filters -| | u8 | -|-----------------------|-----| -| Erode | x | -| Dilate | x | -| Sobel | x | -| Gaussian Blur | x | +| | u8 | +|-------------------------------|-----| +| Erode | x | +| Dilate | x | +| Sobel (3x3) | x | +| Gaussian Blur (3x3, 5x5, 7x7) | x | ## Resize with linear interpolation | | u8 | f32 | -- GitLab