From b0943d4c3d9d6f9093cdf2dd981ea3010571c68a Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Mon, 7 Apr 2025 08:11:27 +0000 Subject: [PATCH 1/8] Avoid LSRT hazard in SSVE version of Gaussian blur, Sobel and SepFilter2D - 3x3 --- .../filters/separable_filter_3x3_sc.h | 9 +++--- kleidicv/src/filters/gaussian_blur_sc.h | 27 ++++++++++++------ kleidicv/src/filters/sobel_sc.h | 28 +++++++++++++------ 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index 6f624ae1c..3988ad533 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -142,11 +142,10 @@ class SeparableFilter { Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); + const BufferType* src0 = &src_rows.at(0, border_offsets.c0())[index]; + const BufferType* src1 = &src_rows.at(0, border_offsets.c1())[index]; + const BufferType* src2 = &src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src0, src1, src2, &dst_rows[index]); } FilterType filter_; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 87dbb051b..0fd883ecc 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -74,10 +74,15 @@ class GaussianBlur { // Applies horizontal filtering vector using scalar operations. // // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - auto acc = src[0] + 2 * src[1] + src[2]; - dst[0] = rounding_shift_right(acc, 4); + void horizontal_scalar_path( + const BufferType *p_src_0, const BufferType *p_src_1, + const BufferType *p_src_2, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); + svuint16_t src_0 = svld1(pg16_1, p_src_0); + svuint16_t src_1 = svld1(pg16_1, p_src_1); + svuint16_t src_2 = svld1(pg16_1, p_src_2); + horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); } }; // end of class GaussianBlur @@ -493,11 +498,15 @@ class GaussianBlur final svst1b_u32(pg, &dst[0], acc); } - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); + void horizontal_scalar_path( + const BufferType *p_src_0, const BufferType *p_src_1, + const BufferType *p_src_2, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg32_1 = svptrue_pat_b32(SV_VL1); + svuint32_t src_0 = svld1(pg32_1, p_src_0); + svuint32_t src_1 = svld1(pg32_1, p_src_1); + svuint32_t src_2 = svld1(pg32_1, p_src_2); + horizontal_vector_path(pg32_1, src_0, src_1, src_2, dst); } }; // end of class GaussianBlur diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index be3167d4f..6630d8675 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -59,10 +59,15 @@ class HorizontalSobel3x3 { // Applies horizontal filtering vector using scalar operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - // Explicitly narrow. Overflow is permitted. - dst[0] = static_cast(src[2] - src[0]); + void horizontal_scalar_path( + const BufferType *p_src_0, const BufferType *p_src_1, + const BufferType *p_src_2, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); + svint16_t src_0 = svld1(pg16_1, p_src_0); + svint16_t src_1 = svld1(pg16_1, p_src_1); + svint16_t src_2 = svld1(pg16_1, p_src_2); + horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); } }; // end of class HorizontalSobel3x3 @@ -112,12 +117,17 @@ class VerticalSobel3x3 { // Applies horizontal filtering vector using scalar operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - // Explicitly narrow. Overflow is permitted. - dst[0] = static_cast(src[0] + 2 * src[1] + src[2]); + void horizontal_scalar_path( + const BufferType *p_src_0, const BufferType *p_src_1, + const BufferType *p_src_2, + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); + svint16_t src_0 = svld1(pg16_1, p_src_0); + svint16_t src_1 = svld1(pg16_1, p_src_1); + svint16_t src_2 = svld1(pg16_1, p_src_2); + horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); } -}; // end of class VerticalSobel3x3 +}; // end of class HorizontalSobel3x3 KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t sobel_3x3_horizontal_stripe_s16_u8_sc( -- GitLab From e10f072097a0eb2b0d9565d9974225769b8e5b67 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 8 Apr 2025 14:20:16 +0000 Subject: [PATCH 2/8] 3x3 filters for SVE2: Eliminate horizontal scalar paths completely --- .../filters/separable_filter_3x3_sc.h | 6 ++-- kleidicv/include/kleidicv/sve2.h | 24 ++++++++++++++++ kleidicv/src/filters/gaussian_blur_sc.h | 25 ----------------- kleidicv/src/filters/sobel_sc.h | 28 ------------------- 4 files changed, 26 insertions(+), 57 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index 3988ad533..bcc8a9156 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -142,10 +142,8 @@ class SeparableFilter { Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - const BufferType* src0 = &src_rows.at(0, border_offsets.c0())[index]; - const BufferType* src1 = &src_rows.at(0, border_offsets.c1())[index]; - const BufferType* src2 = &src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src0, src1, src2, &dst_rows[index]); + svbool_t pg_1 = BufferVecTraits::template svptrue_pat(); + horizontal_vector_path(pg_1, src_rows, dst_rows, border_offsets, index); } FilterType filter_; diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sve2.h index ebbc5d04e..1a25567b4 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -251,6 +251,30 @@ class VecTraitsBase : public VectorTypes { return svptrue_b64(); } + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b8(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b16(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b32(pat); + } + + template + static std::enable_if_t svptrue_pat() + KLEIDICV_STREAMING_COMPATIBLE { + return svptrue_pat_b64(pat); + } + template static std::enable_if_t svwhilelt( IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 0fd883ecc..d2b25f653 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -70,20 +70,6 @@ class GaussianBlur { svst1b(pg, &dst[0], acc); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path( - const BufferType *p_src_0, const BufferType *p_src_1, - const BufferType *p_src_2, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); - svuint16_t src_0 = svld1(pg16_1, p_src_0); - svuint16_t src_1 = svld1(pg16_1, p_src_1); - svuint16_t src_2 = svld1(pg16_1, p_src_2); - horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); - } }; // end of class GaussianBlur // Template for 5x5 Gaussian Blur binomial filters. @@ -497,17 +483,6 @@ class GaussianBlur final acc = svrshr_n_u32_x(pg, acc, 16); svst1b_u32(pg, &dst[0], acc); } - - void horizontal_scalar_path( - const BufferType *p_src_0, const BufferType *p_src_1, - const BufferType *p_src_2, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg32_1 = svptrue_pat_b32(SV_VL1); - svuint32_t src_0 = svld1(pg32_1, p_src_0); - svuint32_t src_1 = svld1(pg32_1, p_src_1); - svuint32_t src_2 = svld1(pg32_1, p_src_2); - horizontal_vector_path(pg32_1, src_0, src_1, src_2, dst); - } }; // end of class GaussianBlur template <> diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 6630d8675..9bb005404 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -55,20 +55,6 @@ class HorizontalSobel3x3 { DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { svst1(pg, &dst[0], svsub_x(pg, src_2, src_0)); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void horizontal_scalar_path( - const BufferType *p_src_0, const BufferType *p_src_1, - const BufferType *p_src_2, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); - svint16_t src_0 = svld1(pg16_1, p_src_0); - svint16_t src_1 = svld1(pg16_1, p_src_1); - svint16_t src_2 = svld1(pg16_1, p_src_2); - horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); - } }; // end of class HorizontalSobel3x3 // Template for 3x3 Sobel filters which calculate vertical derivative @@ -113,20 +99,6 @@ class VerticalSobel3x3 { acc = svmad_s16_x(pg, src_1, svdup_n_s16(2), acc); svst1(pg, &dst[0], acc); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path( - const BufferType *p_src_0, const BufferType *p_src_1, - const BufferType *p_src_2, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg16_1 = svptrue_pat_b16(SV_VL1); - svint16_t src_0 = svld1(pg16_1, p_src_0); - svint16_t src_1 = svld1(pg16_1, p_src_1); - svint16_t src_2 = svld1(pg16_1, p_src_2); - horizontal_vector_path(pg16_1, src_0, src_1, src_2, dst); - } }; // end of class HorizontalSobel3x3 KLEIDICV_TARGET_FN_ATTRS -- GitLab From 02b328ca661e1681dd57a76e76a02e9f99daea32 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 9 Apr 2025 08:07:05 +0000 Subject: [PATCH 3/8] Avoid LSRT hazard in SSVE version of Gaussian blur, Sobel and SepFilter2D - kernels of 5, 7 and 15 --- .../filters/separable_filter_15x15_sc.h | 19 +-- .../filters/separable_filter_5x5_sc.h | 9 +- .../filters/separable_filter_7x7_sc.h | 11 +- kleidicv/src/filters/gaussian_blur_sc.h | 64 ---------- kleidicv/src/filters/separable_filter_2d_sc.h | 112 +++--------------- 5 files changed, 25 insertions(+), 190 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index f95067a09..4745aca8c 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -233,23 +233,8 @@ class SeparableFilter { Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - src[7] = src_rows.at(0, border_offsets.c7())[index]; - src[8] = src_rows.at(0, border_offsets.c8())[index]; - src[9] = src_rows.at(0, border_offsets.c9())[index]; - src[10] = src_rows.at(0, border_offsets.c10())[index]; - src[11] = src_rows.at(0, border_offsets.c11())[index]; - src[12] = src_rows.at(0, border_offsets.c12())[index]; - src[13] = src_rows.at(0, border_offsets.c13())[index]; - src[14] = src_rows.at(0, border_offsets.c14())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); + svbool_t pg_1 = BufferVecTraits::template svptrue_pat(); + horizontal_vector_path(pg_1, src_rows, dst_rows, border_offsets, index); } FilterType filter_; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h index 909e8ce18..22f604bf0 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h @@ -158,13 +158,8 @@ class SeparableFilter { Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); + svbool_t pg_1 = BufferVecTraits::template svptrue_pat(); + horizontal_vector_path(pg_1, src_rows, dst_rows, border_offsets, index); } FilterType filter_; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h index 33f204a10..b7e1fdc50 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h @@ -172,15 +172,8 @@ class SeparableFilter { Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); + svbool_t pg_1 = BufferVecTraits::template svptrue_pat(); + horizontal_vector_path(pg_1, src_rows, dst_rows, border_offsets, index); } FilterType filter_; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index d2b25f653..e8da099d5 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -124,15 +124,6 @@ class GaussianBlur { acc = svrshr_x(pg, acc, 8); svst1b(pg, &dst[0], acc); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2]; - dst[0] = rounding_shift_right(acc, 8); - } }; // end of class GaussianBlur // Template for 7x7 Gaussian Blur binomial filters. @@ -237,17 +228,6 @@ class GaussianBlur { svst1b(pg, &dst[0], acc_0_1_2_3_4_5_6_u16); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * - // * [ 2, 7, 14, 18, 14, 7, 2 ]T - void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + - src[4] * 14 + src[5] * 7 + src[6] * 2; - dst[0] = rounding_shift_right(acc, 12); - } }; // end of class GaussianBlur // Template for 15x15 Gaussian Blur binomial filters. @@ -403,20 +383,6 @@ class GaussianBlur { acc = svrshr_n_u32_x(pg, acc, 20); svst1b_u32(pg, &dst[0], acc); } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T - void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = (static_cast(src[3]) + src[11]) * 4; - acc += (acc + src[1] + src[13]) * 11; - acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + - (src[4] + src[10]) * 81; - acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; - dst[0] = rounding_shift_right(acc, 20); - } }; // end of class GaussianBlur template @@ -549,14 +515,6 @@ class GaussianBlur final acc = svrshr_n_u32_x(pg, acc, 16); svst1b_u32(pg, &dst[0], acc); } - - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[1] + - src[4] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } }; // end of class GaussianBlur template <> @@ -636,15 +594,6 @@ class GaussianBlur final acc = svrshr_n_u32_x(pg, acc, 16); svst1b_u32(pg, &dst[0], acc); } - - void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[3] + - src[4] * half_kernel_[2] + src[5] * half_kernel_[1] + - src[6] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } }; // end of class GaussianBlur template <> @@ -781,19 +730,6 @@ class GaussianBlur final acc = svrshr_n_u32_x(pg, acc, 16); svst1b_u32(pg, &dst[0], acc); } - - void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[3] + - src[4] * half_kernel_[4] + src[5] * half_kernel_[5] + - src[6] * half_kernel_[6] + src[7] * half_kernel_[7] + - src[8] * half_kernel_[6] + src[9] * half_kernel_[5] + - src[10] * half_kernel_[4] + src[11] * half_kernel_[3] + - src[12] * half_kernel_[2] + src[13] * half_kernel_[1] + - src[14] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } }; // end of class GaussianBlur template diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index a98a3f8ec..5b426cda4 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -28,14 +28,12 @@ class SeparableFilter2D { using DestinationType = uint8_t; SeparableFilter2D( - const SourceType *kernel_x, BufferVectorType &kernel_x_0_u16, - BufferVectorType &kernel_x_1_u16, BufferVectorType &kernel_x_2_u16, - BufferVectorType &kernel_x_3_u16, BufferVectorType &kernel_x_4_u16, - SourceVectorType &kernel_y_0_u8, SourceVectorType &kernel_y_1_u8, - SourceVectorType &kernel_y_2_u8, SourceVectorType &kernel_y_3_u8, - SourceVectorType &kernel_y_4_u8) - : kernel_x_(kernel_x), - kernel_x_0_u16_(kernel_x_0_u16), + BufferVectorType &kernel_x_0_u16, BufferVectorType &kernel_x_1_u16, + BufferVectorType &kernel_x_2_u16, BufferVectorType &kernel_x_3_u16, + BufferVectorType &kernel_x_4_u16, SourceVectorType &kernel_y_0_u8, + SourceVectorType &kernel_y_1_u8, SourceVectorType &kernel_y_2_u8, + SourceVectorType &kernel_y_3_u8, SourceVectorType &kernel_y_4_u8) + : kernel_x_0_u16_(kernel_x_0_u16), kernel_x_1_u16_(kernel_x_1_u16), kernel_x_2_u16_(kernel_x_2_u16), kernel_x_3_u16_(kernel_x_3_u16), @@ -118,32 +116,7 @@ class SeparableFilter2D { svst1b_u16(pg, &dst[0], acc_u16); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - SourceType acc; // NOLINT - if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { - dst[0] = std::numeric_limits::max(); - return; - } - - for (size_t i = 1; i < 5; i++) { - SourceType temp; // NOLINT - if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { - dst[0] = std::numeric_limits::max(); - return; - } - if (__builtin_add_overflow(acc, temp, &acc)) { - dst[0] = std::numeric_limits::max(); - return; - } - } - - dst[0] = acc; - } - private: - const SourceType *kernel_x_; - BufferVectorType &kernel_x_0_u16_; BufferVectorType &kernel_x_1_u16_; BufferVectorType &kernel_x_2_u16_; @@ -168,14 +141,12 @@ class SeparableFilter2D { using DestinationType = uint16_t; SeparableFilter2D( - const SourceType *kernel_x, BufferVectorType &kernel_x_0_u32, - BufferVectorType &kernel_x_1_u32, BufferVectorType &kernel_x_2_u32, - BufferVectorType &kernel_x_3_u32, BufferVectorType &kernel_x_4_u32, - SourceVectorType &kernel_y_0_u16, SourceVectorType &kernel_y_1_u16, - SourceVectorType &kernel_y_2_u16, SourceVectorType &kernel_y_3_u16, - SourceVectorType &kernel_y_4_u16) - : kernel_x_(kernel_x), - kernel_x_0_u32_(kernel_x_0_u32), + BufferVectorType &kernel_x_0_u32, BufferVectorType &kernel_x_1_u32, + BufferVectorType &kernel_x_2_u32, BufferVectorType &kernel_x_3_u32, + BufferVectorType &kernel_x_4_u32, SourceVectorType &kernel_y_0_u16, + SourceVectorType &kernel_y_1_u16, SourceVectorType &kernel_y_2_u16, + SourceVectorType &kernel_y_3_u16, SourceVectorType &kernel_y_4_u16) + : kernel_x_0_u32_(kernel_x_0_u32), kernel_x_1_u32_(kernel_x_1_u32), kernel_x_2_u32_(kernel_x_2_u32), kernel_x_3_u32_(kernel_x_3_u32), @@ -258,32 +229,7 @@ class SeparableFilter2D { svst1h_u32(pg, &dst[0], acc_u32); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT - if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { - dst[0] = std::numeric_limits::max(); - return; - } - - for (size_t i = 1; i < 5; i++) { - SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT - if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { - dst[0] = std::numeric_limits::max(); - return; - } - if (__builtin_add_overflow(acc, temp, &acc)) { - dst[0] = std::numeric_limits::max(); - return; - } - } - - dst[0] = acc; - } - private: - const SourceType *kernel_x_; - BufferVectorType &kernel_x_0_u32_; BufferVectorType &kernel_x_1_u32_; BufferVectorType &kernel_x_2_u32_; @@ -308,14 +254,12 @@ class SeparableFilter2D { using DestinationType = int16_t; SeparableFilter2D( - const SourceType *kernel_x, BufferVectorType &kernel_x_0_s32, - BufferVectorType &kernel_x_1_s32, BufferVectorType &kernel_x_2_s32, - BufferVectorType &kernel_x_3_s32, BufferVectorType &kernel_x_4_s32, - SourceVectorType &kernel_y_0_s16, SourceVectorType &kernel_y_1_s16, - SourceVectorType &kernel_y_2_s16, SourceVectorType &kernel_y_3_s16, - SourceVectorType &kernel_y_4_s16) - : kernel_x_(kernel_x), - kernel_x_0_s32_(kernel_x_0_s32), + BufferVectorType &kernel_x_0_s32, BufferVectorType &kernel_x_1_s32, + BufferVectorType &kernel_x_2_s32, BufferVectorType &kernel_x_3_s32, + BufferVectorType &kernel_x_4_s32, SourceVectorType &kernel_y_0_s16, + SourceVectorType &kernel_y_1_s16, SourceVectorType &kernel_y_2_s16, + SourceVectorType &kernel_y_3_s16, SourceVectorType &kernel_y_4_s16) + : kernel_x_0_s32_(kernel_x_0_s32), kernel_x_1_s32_(kernel_x_1_s32), kernel_x_2_s32_(kernel_x_2_s32), kernel_x_3_s32_(kernel_x_3_s32), @@ -403,25 +347,7 @@ class SeparableFilter2D { svst1h_s32(pg, &dst[0], acc_s32); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - int64_t acc = static_cast(src[0]) * kernel_x_[0]; - for (size_t i = 1; i < 5; i++) { - acc += static_cast(src[i]) * kernel_x_[i]; - } - - if (acc < std::numeric_limits::min()) { - acc = std::numeric_limits::min(); - } else if (acc > std::numeric_limits::max()) { - acc = std::numeric_limits::max(); - } - - dst[0] = static_cast(acc); - } - private: - const SourceType *kernel_x_; - BufferVectorType &kernel_x_0_s32_; BufferVectorType &kernel_x_1_s32_; BufferVectorType &kernel_x_2_s32_; @@ -501,7 +427,7 @@ kleidicv_error_t separable_filter_2d_stripe_sc( KernelYVectorT kernel_y_4 = KernelYVectorTraits::svdup(kernel_y[4]); SeparableFilterClass filterClass{ - kernel_x, kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4, + kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4, kernel_y_0, kernel_y_1, kernel_y_2, kernel_y_3, kernel_y_4}; SeparableFilter filter{filterClass}; -- GitLab From 21744aeb026fef705e244f6cec452a5b626db169 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 15 Apr 2025 15:09:38 +0000 Subject: [PATCH 4/8] WIP Prototype: process 15x15 borders vectorized --- .../filters/separable_filter_15x15_sc.h | 102 +++++++++++++++++- .../include/kleidicv/workspace/separable.h | 47 ++++++-- 2 files changed, 137 insertions(+), 12 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index 4745aca8c..28a75eec8 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -1,10 +1,12 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 #ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H #define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H +#include + #include "kleidicv/sve2.h" #include "kleidicv/workspace/border_15x15.h" @@ -78,6 +80,104 @@ class SeparableFilter { }); } + template + class CopyOperation final : public UnrollTwice { + public: + using ContextType = Context; + using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + VectorType vector_path(ContextType, + VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + return src; + } + }; // end of class CopyOperation + + template + void copy_data_sve2(Rows src_rows, Rows dst_rows, + size_t length) const KLEIDICV_STREAMING_COMPATIBLE { + // 'apply_operation_by_rows' can only handle one channel well + // so width must be multiplied in order to copy all the data + Rectangle rect{length * dst_rows.channels(), std::size_t{1}}; + Rows src_1ch{&src_rows[0], src_rows.stride(), 1}; + Rows dst_1ch{&dst_rows[0], dst_rows.stride(), 1}; + CopyOperation op{}; + apply_operation_by_rows(op, rect, src_1ch, dst_1ch); + } + + size_t process_left_border(Rows src_rows, + Rows dst_rows, + BorderInfoType horizontal_border, + size_t width) const KLEIDICV_STREAMING_COMPATIBLE { + // Process vectors, as many times as needed to pass all + // the borders. Plus, because of the horizontal path, the algorithm needs + // additional pixels. + const size_t block_len = src_rows.channels() * BufferVecTraits::num_lanes(); + const size_t border_len = src_rows.channels() * margin; + const size_t process_len = + ((border_len + block_len - 1) / block_len) * block_len; + const size_t buffer_len = process_len + src_rows.channels() * (15 - 1); + if (buffer_len - margin >= width) { // would it be too long? + return 0; + } + std::unique_ptr left_pixels{new BufferType[buffer_len]}; + BorderOffsets offsets = horizontal_border.offsets_with_left_border(0); + svbool_t pg_ch{}; + switch (src_rows.channels()) { + case 1: + pg_ch = BufferVecTraits::template svptrue_pat(); + break; + case 2: + pg_ch = BufferVecTraits::template svptrue_pat(); + break; + case 3: + pg_ch = BufferVecTraits::template svptrue_pat(); + break; + case 4: + pg_ch = BufferVecTraits::template svptrue_pat(); + break; + default: + break; + } + + Rows buffer_rows{left_pixels.get(), 0, src_rows.channels()}; + Columns buffer_cols = buffer_rows.as_columns(); + Columns src_cols = src_rows.as_columns(); + + // Copy the 7 border pixels (=margin) into the buffer + { + BufferVectorType pixel0 = svld1(pg_ch, src_cols.ptr_at(offsets.c0())); + svst1(pg_ch, buffer_cols.ptr_at(0), pixel0); + BufferVectorType pixel1 = svld1(pg_ch, src_cols.ptr_at(offsets.c1())); + svst1(pg_ch, buffer_cols.ptr_at(1), pixel1); + BufferVectorType pixel2 = svld1(pg_ch, src_cols.ptr_at(offsets.c2())); + svst1(pg_ch, buffer_cols.ptr_at(2), pixel2); + BufferVectorType pixel3 = svld1(pg_ch, src_cols.ptr_at(offsets.c3())); + svst1(pg_ch, buffer_cols.ptr_at(3), pixel3); + BufferVectorType pixel4 = svld1(pg_ch, src_cols.ptr_at(offsets.c4())); + svst1(pg_ch, buffer_cols.ptr_at(4), pixel4); + BufferVectorType pixel5 = svld1(pg_ch, src_cols.ptr_at(offsets.c5())); + svst1(pg_ch, buffer_cols.ptr_at(5), pixel5); + BufferVectorType pixel6 = svld1(pg_ch, src_cols.ptr_at(offsets.c6())); + svst1(pg_ch, buffer_cols.ptr_at(6), pixel6); + } + + // Copy the rest of the buffer using SME copy + copy_data_sve2(src_rows, buffer_rows.at(0, margin), + buffer_len / src_rows.channels() - margin); + + // Do the gaussian blur + offsets = horizontal_border.offsets_without_border(); + for (size_t index = 0; index < process_len; + index += BufferVecTraits::num_lanes()) { + horizontal_vector_path(BufferVecTraits::svptrue(), + buffer_rows.at(0, margin), dst_rows, offsets, + index); + } + + return process_len; + } + // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 8e3034731..63baf247a 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -8,6 +8,7 @@ #include #include +#include "border_15x15.h" #include "border_types.h" #include "kleidicv/kleidicv.h" #include "kleidicv/types.h" @@ -68,6 +69,21 @@ class SeparableFilterWorkspaceDeleter { // // Handling of borders is calculated based on offsets rather than setting up // suitably-sized buffers which could hold both borders and data. + +template +struct has_process_left_border : std::false_type {}; + +extern const Rows _dummy_src; +extern const Rows _dummy_dst; +extern const FixedBorderInfo15x15 _dummy_border; +extern const size_t _dummy_width; + +template +struct has_process_left_border< + F, std::void_t().process_left_border( + _dummy_src, _dummy_dst, _dummy_border, _dummy_width))>> + : std::true_type {}; + class SeparableFilterWorkspace { public: // To avoid load/store penalties. @@ -166,24 +182,33 @@ class SeparableFilterWorkspace { // Margin associated with the filter. constexpr size_t margin = filter.margin; + size_t processed = 0; + // Process data affected by left border. - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t horizontal_index = 0; horizontal_index < margin; - ++horizontal_index) { - auto offsets = - horizontal_border.offsets_with_left_border(horizontal_index); - filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), - dst_rows.at(0, horizontal_index), - offsets); + if constexpr (has_process_left_border::value) { + processed = filter.process_left_border(buffer_rows, dst_rows, + horizontal_border, width); + } + if (processed == 0) { + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + auto offsets = + horizontal_border.offsets_with_left_border(horizontal_index); + filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), + dst_rows.at(0, horizontal_index), + offsets); + } + processed = margin; } // Process data which is not affected by any borders in bulk. { - size_t width_without_borders = width - (2 * margin); + size_t width_without_borders = width - margin - processed; auto offsets = horizontal_border.offsets_without_border(); filter.process_horizontal(width_without_borders, - buffer_rows.at(0, margin), - dst_rows.at(0, margin), offsets); + buffer_rows.at(0, processed), + dst_rows.at(0, processed), offsets); } // Process data affected by right border. -- GitLab From af125e069a77e268955a1b17d33b11f352dfe3fa Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Thu, 24 Apr 2025 11:46:09 +0000 Subject: [PATCH 5/8] WIP vectorized borders 2 --- .../filters/separable_filter_15x15_sc.h | 162 +++++++++++------- .../filters/separable_filter_3x3_sc.h | 4 +- .../filters/separable_filter_5x5_sc.h | 4 +- .../filters/separable_filter_7x7_sc.h | 4 +- kleidicv/src/filters/gaussian_blur_sc.h | 4 +- kleidicv/src/filters/separable_filter_2d_sc.h | 5 +- kleidicv/src/filters/sobel_sc.h | 15 +- 7 files changed, 126 insertions(+), 72 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index 28a75eec8..3e699ea76 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -5,6 +5,8 @@ #ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H #define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H +#include + #include #include "kleidicv/sve2.h" @@ -35,8 +37,24 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} + explicit SeparableFilter(FilterType filter, svuint32_t& t1, svuint32_t& t2, + svuint32_t& t3, + svuint32_t& t4) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter}, + t1_{t1}, + t2_{t2}, + t3_{t3}, + t4_{t4} { + uint32_t kTblPair[16] = {0, 16}; + uint32_t kTblPair2[16] = {0, 1, 16, 17}; + uint32_t kTblPair4[16] = {0, 1, 2, 3, 16, 17, 18, 19}; + uint32_t kTblPair7[16] = {0, 1, 2, 3, 4, 5, 6, 16, + 17, 18, 19, 20, 21, 22, 23, 24}; + t1_ = svld1(svptrue_b32(), kTblPair); + t2_ = svld1(svptrue_b32(), kTblPair2); + t3_ = svld1(svptrue_b32(), kTblPair4); + t4_ = svld1(svptrue_b32(), kTblPair7); + } static constexpr size_t margin = 7UL; @@ -80,99 +98,112 @@ class SeparableFilter { }); } - template - class CopyOperation final : public UnrollTwice { - public: - using ContextType = Context; - using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - using VectorType = typename VecTraits::VectorType; - - VectorType vector_path(ContextType, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { - return src; - } - }; // end of class CopyOperation - - template - void copy_data_sve2(Rows src_rows, Rows dst_rows, - size_t length) const KLEIDICV_STREAMING_COMPATIBLE { - // 'apply_operation_by_rows' can only handle one channel well - // so width must be multiplied in order to copy all the data - Rectangle rect{length * dst_rows.channels(), std::size_t{1}}; - Rows src_1ch{&src_rows[0], src_rows.stride(), 1}; - Rows dst_1ch{&dst_rows[0], dst_rows.stride(), 1}; - CopyOperation op{}; - apply_operation_by_rows(op, rect, src_1ch, dst_1ch); - } - size_t process_left_border(Rows src_rows, Rows dst_rows, BorderInfoType horizontal_border, size_t width) const KLEIDICV_STREAMING_COMPATIBLE { - // Process vectors, as many times as needed to pass all - // the borders. Plus, because of the horizontal path, the algorithm needs - // additional pixels. - const size_t block_len = src_rows.channels() * BufferVecTraits::num_lanes(); - const size_t border_len = src_rows.channels() * margin; - const size_t process_len = - ((border_len + block_len - 1) / block_len) * block_len; - const size_t buffer_len = process_len + src_rows.channels() * (15 - 1); - if (buffer_len - margin >= width) { // would it be too long? - return 0; - } - std::unique_ptr left_pixels{new BufferType[buffer_len]}; - BorderOffsets offsets = horizontal_border.offsets_with_left_border(0); - svbool_t pg_ch{}; switch (src_rows.channels()) { case 1: - pg_ch = BufferVecTraits::template svptrue_pat(); + return process_left_border<1UL>( + src_rows, dst_rows, horizontal_border, width, + BufferVecTraits::template svptrue_pat()); break; case 2: - pg_ch = BufferVecTraits::template svptrue_pat(); + return process_left_border<2UL>( + src_rows, dst_rows, horizontal_border, width, + BufferVecTraits::template svptrue_pat()); break; case 3: - pg_ch = BufferVecTraits::template svptrue_pat(); + return process_left_border<3UL>( + src_rows, dst_rows, horizontal_border, width, + BufferVecTraits::template svptrue_pat()); break; case 4: - pg_ch = BufferVecTraits::template svptrue_pat(); + return process_left_border<4UL>( + src_rows, dst_rows, horizontal_border, width, + BufferVecTraits::template svptrue_pat()); break; default: break; } + return 0; + } + + template + size_t process_left_border( + Rows src_rows, Rows dst_rows, + BorderInfoType horizontal_border, size_t width, + svbool_t pg_ch) const KLEIDICV_STREAMING_COMPATIBLE { + // Process vectors, as many times as needed to pass + // all the borders. Plus, because of the horizontal path, the algorithm + // needs additional pixels. + const size_t block_len = Channels * BufferVecTraits::num_lanes(); + const size_t border_len = Channels * margin; + const size_t process_len = + ((border_len + block_len - 1) / block_len) * block_len; + const size_t buffer_len = process_len + Channels * (15 - 1); + if (buffer_len - margin >= width) { // would it be too long? + return 0; + } + // PROTO: now this is pretty much fixed + // only implemented for 512-bit vector length + if (svcntw() != 16) { + return 0; + } + // With 4 channels, buffer_len is 14*4 + 16 = 72, that needs 5 vectors (80) + BufferVectorType vbuf0, vbuf1; //, vbuf2, vbuf3, vbuf4; + // BufferVectorType* pvbuf[5] = {&vbuf0, &vbuf1, &vbuf2, &vbuf3, &vbuf4}; + + BorderOffsets offsets = horizontal_border.offsets_with_left_border(0); - Rows buffer_rows{left_pixels.get(), 0, src_rows.channels()}; - Columns buffer_cols = buffer_rows.as_columns(); Columns src_cols = src_rows.as_columns(); // Copy the 7 border pixels (=margin) into the buffer { BufferVectorType pixel0 = svld1(pg_ch, src_cols.ptr_at(offsets.c0())); - svst1(pg_ch, buffer_cols.ptr_at(0), pixel0); BufferVectorType pixel1 = svld1(pg_ch, src_cols.ptr_at(offsets.c1())); - svst1(pg_ch, buffer_cols.ptr_at(1), pixel1); BufferVectorType pixel2 = svld1(pg_ch, src_cols.ptr_at(offsets.c2())); - svst1(pg_ch, buffer_cols.ptr_at(2), pixel2); BufferVectorType pixel3 = svld1(pg_ch, src_cols.ptr_at(offsets.c3())); - svst1(pg_ch, buffer_cols.ptr_at(3), pixel3); BufferVectorType pixel4 = svld1(pg_ch, src_cols.ptr_at(offsets.c4())); - svst1(pg_ch, buffer_cols.ptr_at(4), pixel4); BufferVectorType pixel5 = svld1(pg_ch, src_cols.ptr_at(offsets.c5())); - svst1(pg_ch, buffer_cols.ptr_at(5), pixel5); BufferVectorType pixel6 = svld1(pg_ch, src_cols.ptr_at(offsets.c6())); - svst1(pg_ch, buffer_cols.ptr_at(6), pixel6); + if constexpr (Channels == 1) { + // need to load 14 + 16 = 30 elements, that's two vectors + BufferVectorType px01 = svtbl2_u32(svcreate2(pixel0, pixel1), t1_); + BufferVectorType px23 = svtbl2_u32(svcreate2(pixel2, pixel3), t1_); + BufferVectorType px45 = svtbl2_u32(svcreate2(pixel4, pixel5), t1_); + BufferVectorType px0123 = svtbl2_u32(svcreate2(px01, px23), t2_); + BufferVectorType px456 = svtbl2_u32(svcreate2(px45, pixel6), t2_); + BufferVectorType px0to6 = svtbl2_u32(svcreate2(px0123, px456), t3_); + BufferVectorType image0 = svld1(svwhilelt_b32(7, 16), &src_cols[0]); + vbuf1 = svld1(svwhilelt_b32(0, 14), &src_cols[16 - 7]); + vbuf0 = svtbl2_u32(svcreate2(px0to6, image0), t4_); + } else { + vbuf0 = svld1(svptrue_b32(), &src_cols[0]); + } } - - // Copy the rest of the buffer using SME copy - copy_data_sve2(src_rows, buffer_rows.at(0, margin), - buffer_len / src_rows.channels() - margin); - // Do the gaussian blur - offsets = horizontal_border.offsets_without_border(); - for (size_t index = 0; index < process_len; - index += BufferVecTraits::num_lanes()) { - horizontal_vector_path(BufferVecTraits::svptrue(), - buffer_rows.at(0, margin), dst_rows, offsets, - index); + + if constexpr (Channels == 1) { + BufferVectorType src_0 = vbuf0; + BufferVectorType src_1 = svext(vbuf0, vbuf1, 1); + BufferVectorType src_2 = svext(vbuf0, vbuf1, 2); + BufferVectorType src_3 = svext(vbuf0, vbuf1, 3); + BufferVectorType src_4 = svext(vbuf0, vbuf1, 4); + BufferVectorType src_5 = svext(vbuf0, vbuf1, 5); + BufferVectorType src_6 = svext(vbuf0, vbuf1, 6); + BufferVectorType src_7 = svext(vbuf0, vbuf1, 7); + BufferVectorType src_8 = svext(vbuf0, vbuf1, 8); + BufferVectorType src_9 = svext(vbuf0, vbuf1, 9); + BufferVectorType src_10 = svext(vbuf0, vbuf1, 10); + BufferVectorType src_11 = svext(vbuf0, vbuf1, 11); + BufferVectorType src_12 = svext(vbuf0, vbuf1, 12); + BufferVectorType src_13 = svext(vbuf0, vbuf1, 13); + BufferVectorType src_14 = svext(vbuf0, vbuf1, 14); + filter_.horizontal_vector_path(svptrue_b32(), src_0, src_1, src_2, src_3, + src_4, src_5, src_6, src_7, src_8, src_9, + src_10, src_11, src_12, src_13, src_14, + &dst_rows.as_columns()[0]); } return process_len; @@ -338,6 +369,7 @@ class SeparableFilter { } FilterType filter_; + svuint32_t &t1_, &t2_, &t3_, &t4_; }; // end of class SeparableFilter // Shorthand for 15x15 separable filters driver type. diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index bcc8a9156..a599389ea 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -33,7 +33,9 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter, BufferVectorType, + BufferVectorType, BufferVectorType, + BufferVectorType) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} static constexpr size_t margin = 1UL; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h index 22f604bf0..7dda46464 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h @@ -33,7 +33,9 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter, BufferVectorType, + BufferVectorType, BufferVectorType, + BufferVectorType) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} static constexpr size_t margin = 2UL; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h index b7e1fdc50..9f0484f41 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h @@ -33,7 +33,9 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter, BufferVectorType, + BufferVectorType, BufferVectorType, + BufferVectorType) KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} static constexpr size_t margin = 3UL; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index e8da099d5..e5ee0ee44 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -741,7 +741,9 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; + typename VecTraits::VectorType t1, + t2, t3, t4; + SeparableFilter filter{blur, t1, t2, t3, t4}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 5b426cda4..91b08992c 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -429,7 +429,10 @@ kleidicv_error_t separable_filter_2d_stripe_sc( SeparableFilterClass filterClass{ kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4, kernel_y_0, kernel_y_1, kernel_y_2, kernel_y_3, kernel_y_4}; - SeparableFilter filter{filterClass}; + typename VecTraits::VectorType t1, + t2, t3, t4; + + SeparableFilter filter{filterClass, t1, t2, t3, t4}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 9bb005404..10e385ffa 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -125,7 +125,12 @@ static kleidicv_error_t sobel_3x3_horizontal_stripe_s16_u8_sc( } HorizontalSobel3x3 horizontal_sobel; - SeparableFilter3x3> filter{horizontal_sobel}; + typename VecTraits< + typename HorizontalSobel3x3::BufferType>::VectorType t1, + t2, t3, t4; + + SeparableFilter3x3> filter{horizontal_sobel, t1, + t2, t3, t4}; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -155,7 +160,13 @@ static kleidicv_error_t sobel_3x3_vertical_stripe_s16_u8_sc( } VerticalSobel3x3 vertical_sobel; - SeparableFilter3x3> filter{vertical_sobel}; + typename VecTraits::BufferType>::VectorType + t1, + t2, t3, t4; + + SeparableFilter3x3> filter{vertical_sobel, t1, t2, + t3, t4}; + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; -- GitLab From 9187c1b5768cfa163d4e4bc053b1e1dea040a7d4 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Fri, 25 Apr 2025 05:11:34 +0000 Subject: [PATCH 6/8] WIP border with vectors: reflect only --- .../filters/separable_filter_15x15_sc.h | 147 +++++++++--------- .../include/kleidicv/workspace/separable.h | 44 +++++- 2 files changed, 109 insertions(+), 82 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index 3e699ea76..1ba9483d8 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -45,15 +45,34 @@ class SeparableFilter { t2_{t2}, t3_{t3}, t4_{t4} { - uint32_t kTblPair[16] = {0, 16}; - uint32_t kTblPair2[16] = {0, 1, 16, 17}; - uint32_t kTblPair4[16] = {0, 1, 2, 3, 16, 17, 18, 19}; - uint32_t kTblPair7[16] = {0, 1, 2, 3, 4, 5, 6, 16, - 17, 18, 19, 20, 21, 22, 23, 24}; - t1_ = svld1(svptrue_b32(), kTblPair); - t2_ = svld1(svptrue_b32(), kTblPair2); - t3_ = svld1(svptrue_b32(), kTblPair4); - t4_ = svld1(svptrue_b32(), kTblPair7); + /* + uint32_t kTblPair[16] = {0, 16}; + uint32_t kTblPair2[16] = {0, 1, 16, 17}; + uint32_t kTblPair4[16] = {0, 1, 2, 3, 16, 17, 18, 19}; + uint32_t kTblPair7[16] = {0, 1, 2, 3, 4, 5, 6, 16, + 17, 18, 19, 20, 21, 22, 23, 24}; + t1_ = svld1(svptrue_b32(), kTblPair); + t2_ = svld1(svptrue_b32(), kTblPair2); + t3_ = svld1(svptrue_b32(), kTblPair4); + t4_ = svld1(svptrue_b32(), kTblPair7); + */ + // PROTO: REFLECT ONLY + // case FixedBorderType::REFLECT: + // if (column_index == 0) { + // return get(6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7); + uint32_t kTbl[16] = {6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + t1_ = svld1(svptrue_b32(), kTbl); + + /* this is to replace SVEXTs with SVTBLs + uint32_t kTbl3[16] = {3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18}; + uint32_t kTbl6[16] = {6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21}; + uint32_t kTbl9[16] = {9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24}; + t2_ = svld1(svptrue_b32(), kTbl3); + t3_ = svld1(svptrue_b32(), kTbl6); + t4_ = svld1(svptrue_b32(), kTbl9);*/ } static constexpr size_t margin = 7UL; @@ -98,88 +117,49 @@ class SeparableFilter { }); } - size_t process_left_border(Rows src_rows, - Rows dst_rows, - BorderInfoType horizontal_border, - size_t width) const KLEIDICV_STREAMING_COMPATIBLE { - switch (src_rows.channels()) { - case 1: - return process_left_border<1UL>( - src_rows, dst_rows, horizontal_border, width, - BufferVecTraits::template svptrue_pat()); - break; - case 2: - return process_left_border<2UL>( - src_rows, dst_rows, horizontal_border, width, - BufferVecTraits::template svptrue_pat()); - break; - case 3: - return process_left_border<3UL>( - src_rows, dst_rows, horizontal_border, width, - BufferVecTraits::template svptrue_pat()); - break; - case 4: - return process_left_border<4UL>( - src_rows, dst_rows, horizontal_border, width, - BufferVecTraits::template svptrue_pat()); - break; - default: - break; - } - return 0; - } - template - size_t process_left_border( - Rows src_rows, Rows dst_rows, - BorderInfoType horizontal_border, size_t width, - svbool_t pg_ch) const KLEIDICV_STREAMING_COMPATIBLE { + size_t process_left_border(Rows src_rows, + Rows dst_rows, BorderInfoType, + size_t) const KLEIDICV_STREAMING_COMPATIBLE { // Process vectors, as many times as needed to pass // all the borders. Plus, because of the horizontal path, the algorithm // needs additional pixels. - const size_t block_len = Channels * BufferVecTraits::num_lanes(); - const size_t border_len = Channels * margin; - const size_t process_len = + // This algo only works with 512 bits and 32 bit words --> 16 lanes + constexpr size_t num_lanes = 16; + constexpr size_t block_len = Channels * num_lanes; + constexpr size_t border_len = Channels * margin; + constexpr size_t process_len = ((border_len + block_len - 1) / block_len) * block_len; - const size_t buffer_len = process_len + Channels * (15 - 1); - if (buffer_len - margin >= width) { // would it be too long? - return 0; - } + /* const size_t buffer_len = process_len + Channels * (15 - 1); + if (buffer_len - margin >= width) { // would it be too long? + return 0; + }*/ // PROTO: now this is pretty much fixed // only implemented for 512-bit vector length - if (svcntw() != 16) { - return 0; - } + /* if (svcntw() != 16) { + return 0; + }*/ // With 4 channels, buffer_len is 14*4 + 16 = 72, that needs 5 vectors (80) BufferVectorType vbuf0, vbuf1; //, vbuf2, vbuf3, vbuf4; // BufferVectorType* pvbuf[5] = {&vbuf0, &vbuf1, &vbuf2, &vbuf3, &vbuf4}; - BorderOffsets offsets = horizontal_border.offsets_with_left_border(0); + // BorderOffsets offsets = horizontal_border.offsets_with_left_border(0); + + // PROTO: REFLECT ONLY + // case FixedBorderType::REFLECT: + // if (column_index == 0) { + // return get(6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7); Columns src_cols = src_rows.as_columns(); - // Copy the 7 border pixels (=margin) into the buffer + // Load the border-affected pixels (=margin) into the buffer { - BufferVectorType pixel0 = svld1(pg_ch, src_cols.ptr_at(offsets.c0())); - BufferVectorType pixel1 = svld1(pg_ch, src_cols.ptr_at(offsets.c1())); - BufferVectorType pixel2 = svld1(pg_ch, src_cols.ptr_at(offsets.c2())); - BufferVectorType pixel3 = svld1(pg_ch, src_cols.ptr_at(offsets.c3())); - BufferVectorType pixel4 = svld1(pg_ch, src_cols.ptr_at(offsets.c4())); - BufferVectorType pixel5 = svld1(pg_ch, src_cols.ptr_at(offsets.c5())); - BufferVectorType pixel6 = svld1(pg_ch, src_cols.ptr_at(offsets.c6())); + // need to load 14 + 16 = 30 elements, that's two vectors + // but it's actually only 23 pixels, the first 7 are the same, permuted if constexpr (Channels == 1) { - // need to load 14 + 16 = 30 elements, that's two vectors - BufferVectorType px01 = svtbl2_u32(svcreate2(pixel0, pixel1), t1_); - BufferVectorType px23 = svtbl2_u32(svcreate2(pixel2, pixel3), t1_); - BufferVectorType px45 = svtbl2_u32(svcreate2(pixel4, pixel5), t1_); - BufferVectorType px0123 = svtbl2_u32(svcreate2(px01, px23), t2_); - BufferVectorType px456 = svtbl2_u32(svcreate2(px45, pixel6), t2_); - BufferVectorType px0to6 = svtbl2_u32(svcreate2(px0123, px456), t3_); - BufferVectorType image0 = svld1(svwhilelt_b32(7, 16), &src_cols[0]); - vbuf1 = svld1(svwhilelt_b32(0, 14), &src_cols[16 - 7]); - vbuf0 = svtbl2_u32(svcreate2(px0to6, image0), t4_); - } else { - vbuf0 = svld1(svptrue_b32(), &src_cols[0]); + BufferVectorType image0 = svld1(svptrue_b32(), &src_cols[0]); + vbuf1 = svld1(svwhilelt_b32(9, 23), &src_cols[9]); + vbuf0 = svtbl_u32(image0, t1_); } } // Do the gaussian blur @@ -200,6 +180,23 @@ class SeparableFilter { BufferVectorType src_12 = svext(vbuf0, vbuf1, 12); BufferVectorType src_13 = svext(vbuf0, vbuf1, 13); BufferVectorType src_14 = svext(vbuf0, vbuf1, 14); + /* alternative way, did not help: + BufferVectorType src_0 = vbuf0; + BufferVectorType src_1 = svext(vbuf0, vbuf1, 1); + BufferVectorType src_2 = svext(vbuf0, vbuf1, 2); + BufferVectorType src_3 = svtbl2(svcreate2(vbuf0, vbuf1), t2_); + BufferVectorType src_4 = svext(vbuf0, vbuf1, 4); + BufferVectorType src_5 = svext(vbuf0, vbuf1, 5); + BufferVectorType src_6 = svtbl2(svcreate2(vbuf0, vbuf1), t3_); + BufferVectorType src_7 = svext(vbuf0, vbuf1, 7); + BufferVectorType src_8 = svext(vbuf0, vbuf1, 8); + BufferVectorType src_9 = svtbl2(svcreate2(vbuf0, vbuf1), t4_); + BufferVectorType src_10 = svext(vbuf0, vbuf1, 10); + BufferVectorType src_11 = svext(vbuf0, vbuf1, 11); + BufferVectorType src_12 = svext(vbuf0, vbuf1, 12); + BufferVectorType src_13 = svext(vbuf0, vbuf1, 13); + BufferVectorType src_14 = svext(vbuf0, vbuf1, 14); + */ filter_.horizontal_vector_path(svptrue_b32(), src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, src_9, src_10, src_11, src_12, src_13, src_14, diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 63baf247a..bcdb5bff9 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -146,6 +146,34 @@ class SeparableFilterWorkspace { Rows dst_rows, size_t channels, typename FilterType::BorderType border_type, FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + switch (channels) { + case 1: + process_template<1>(rect, y_begin, y_end, src_rows, dst_rows, + border_type, filter); + break; + case 2: + process_template<2>(rect, y_begin, y_end, src_rows, dst_rows, + border_type, filter); + break; + case 3: + process_template<3>(rect, y_begin, y_end, src_rows, dst_rows, + border_type, filter); + break; + case 4: + process_template<4>(rect, y_begin, y_end, src_rows, dst_rows, + border_type, filter); + break; + default: + break; + } + } + + template + void process_template(Rectangle rect, size_t y_begin, size_t y_end, + Rows src_rows, + Rows dst_rows, + typename FilterType::BorderType border_type, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; @@ -155,7 +183,7 @@ class SeparableFilterWorkspace { // Buffer rows which hold intermediate widened data. auto buffer_rows = Rows{reinterpret_cast( &data_[buffer_rows_offset_]), - buffer_rows_stride_, channels}; + buffer_rows_stride_, Channels}; // Vertical processing loop. for (size_t vertical_index = y_begin; vertical_index < y_end; @@ -166,13 +194,14 @@ class SeparableFilterWorkspace { filter.process_vertical(rect.width(), src_rows.at(vertical_index), buffer_rows, offsets); // Process in the horizontal direction last. - process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), - filter, horizontal_border); + process_horizontal(rect.width(), buffer_rows, + dst_rows.at(vertical_index), filter, + horizontal_border); } } protected: - template + template void process_horizontal(size_t width, Rows buffer_rows, Rows dst_rows, @@ -186,9 +215,10 @@ class SeparableFilterWorkspace { // Process data affected by left border. if constexpr (has_process_left_border::value) { - processed = filter.process_left_border(buffer_rows, dst_rows, - horizontal_border, width); + processed = filter.process_left_border( + buffer_rows, dst_rows, horizontal_border, width); } + /* if (processed == 0) { KLEIDICV_FORCE_LOOP_UNROLL for (size_t horizontal_index = 0; horizontal_index < margin; @@ -200,7 +230,7 @@ class SeparableFilterWorkspace { offsets); } processed = margin; - } + }*/ // Process data which is not affected by any borders in bulk. { -- GitLab From 0d75d2812ef7b0bc1284f79d7cb3314bd8c97668 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Mon, 28 Apr 2025 15:55:08 +0000 Subject: [PATCH 7/8] WIP Disable GaussianBlur tests except 15x15 1channel --- conformity/opencv/test_gaussian_blur.cpp | 14 ++++++---- conformity/opencv/tests.cpp | 34 ++++++++++++------------ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index d40bddd67..b0214b92a 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -28,14 +28,15 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, cv::RNG rng(0); size_t size_min = 5; - size_t size_max = 16; + size_t width_max = 16; + size_t height_max = 16; if constexpr (KernelSize == 15) { size_min = 14; - size_max = 32; + width_max = 120; } - for (size_t y = size_min; y <= size_max; ++y) { - for (size_t x = size_min; x <= size_max; ++x) { + for (size_t y = size_min; y <= height_max; y += 3) { + for (size_t x = size_min; x <= width_max; x += 5) { // Two extra lines allocated to be sure sigma can be placed next to the // real input cv::Mat input(y + 2, x, CV_8UC(Channels)); @@ -79,6 +80,7 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, std::vector& gaussian_blur_tests_get() { // clang-format off static std::vector tests = { + /* TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), @@ -163,8 +165,9 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - +*/ TEST("Gaussian blur 15x15, BORDER_REFLECT, 1 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), +/* TEST("Gaussian blur 15x15, BORDER_REFLECT, 2 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), TEST("Gaussian blur 15x15, BORDER_REFLECT, 3 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), TEST("Gaussian blur 15x15, BORDER_REFLECT, 4 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), @@ -178,6 +181,7 @@ std::vector& gaussian_blur_tests_get() { TEST("Gaussian blur 15x15, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 15x15, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), TEST("Gaussian blur 15x15, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + */ }; // clang-format on return tests; diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp index 614b66cb1..c54e129d5 100644 --- a/conformity/opencv/tests.cpp +++ b/conformity/opencv/tests.cpp @@ -23,27 +23,27 @@ static std::vector merge_tests( } std::vector all_tests = merge_tests({ - binary_op_tests_get, - cvtcolor_tests_get, - morphology_tests_get, +// binary_op_tests_get, +// cvtcolor_tests_get, +// morphology_tests_get, #if KLEIDICV_ENABLE_ALL_OPENCV_HAL separable_filter_2d_tests_get, #endif gaussian_blur_tests_get, - rgb2yuv_tests_get, - yuv2rgb_tests_get, - sobel_tests_get, - exp_tests_get, - float_conversion_tests_get, - resize_tests_get, - scale_tests_get, - sum_tests_get, - min_max_tests_get, - in_range_tests_get, - remap_tests_get, - warp_perspective_tests_get, - blur_and_downsample_tests_get, - scharr_interleaved_tests_get, + /* rgb2yuv_tests_get, + yuv2rgb_tests_get, + sobel_tests_get, + exp_tests_get, + float_conversion_tests_get, + resize_tests_get, + scale_tests_get, + sum_tests_get, + min_max_tests_get, + in_range_tests_get, + remap_tests_get, + warp_perspective_tests_get, + blur_and_downsample_tests_get, + scharr_interleaved_tests_get,*/ }); #if MANAGER -- GitLab From b4bff042bcd855edfd2eed90d3feb956087cff4d Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Thu, 8 May 2025 10:13:43 +0000 Subject: [PATCH 8/8] Decrease data dependency in GaussianBlur 15x15 --- .../filters/separable_filter_generic_neon.h | 155 ++++++++++++++++++ .../kleidicv/workspace/border_generic.h | 88 ++++++++++ kleidicv/src/filters/gaussian_blur_sc.h | 65 ++++---- 3 files changed, 273 insertions(+), 35 deletions(-) create mode 100644 kleidicv/include/kleidicv/filters/separable_filter_generic_neon.h create mode 100644 kleidicv/include/kleidicv/workspace/border_generic.h diff --git a/kleidicv/include/kleidicv/filters/separable_filter_generic_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_generic_neon.h new file mode 100644 index 000000000..357420bcd --- /dev/null +++ b/kleidicv/include/kleidicv/filters/separable_filter_generic_neon.h @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_GENERIC_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_GENERIC_NEON_H + +#include "kleidicv/config.h" +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_generic.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of generic size kernels separable filters. +template +class SeparableFilterGeneric { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = BorderInfoGeneric; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 1UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(border_offsets.c0())[index]; + auto src_1 = &src_rows.at(border_offsets.c1())[index]; + auto src_2 = &src_rows.at(border_offsets.c2())[index]; + + typename SourceVecTraits::Vector2Type src_0_x2; + SourceVecTraits::load(&src_0[0], src_0_x2); + typename SourceVecTraits::Vector2Type src_1_x2; + SourceVecTraits::load(&src_1[0], src_1_x2); + typename SourceVecTraits::Vector2Type src_2_x2; + SourceVecTraits::load(&src_2[0], src_2_x2); + + SourceVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.vertical_vector_path(src_a, &dst_rows[index]); + filter_.vertical_vector_path( + src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + SourceVectorType src[3]; + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[3]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + + typename BufferVecTraits::Vector2Type src_0_x2; + BufferVecTraits::load(&src_0[0], src_0_x2); + typename BufferVecTraits::Vector2Type src_1_x2; + BufferVecTraits::load(&src_1[0], src_1_x2); + typename BufferVecTraits::Vector2Type src_2_x2; + BufferVecTraits::load(&src_2[0], src_2_x2); + + BufferVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[3]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_GENERIC_NEON_H diff --git a/kleidicv/include/kleidicv/workspace/border_generic.h b/kleidicv/include/kleidicv/workspace/border_generic.h new file mode 100644 index 000000000..47b152ac6 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_generic.h @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_GENERIC_H +#define KLEIDICV_WORKSPACE_BORDER_GENERIC_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Generic border offsets for any size filters. +template +class GenericBorderInfo { + public: + GenericBorderInfo(size_t width) : width_(width), margin_(width / 2) {} + + // Returns offset without the influence of any border. + ptrdiff_t offset_without_border(size_t index) const { + return static_cast(index) - margin_; + } + + // Returns offset for columns affected by left border. + ptrdiff_t offset_with_left_border(size_t base_index, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + // TODO this is the REPLICATE case, implement all cases + return static_cast(base_index) + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE +} + +// Returns offsets for columns affected by right border. +Offsets +offsets_with_right_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(-1, 0, 0); + break; + + case FixedBorderType::WRAP: + return get(-1, 0, 1 - width_); + break; + + case FixedBorderType::REVERSE: + return get(-1, 0, -1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE +} + +// Returns offsets for rows or columns affected by any border. +Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index == 0U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index == (width_ - 1U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); +} + +private: +// Takes care of static signed to unsigned casts. +Offsets get(ptrdiff_t o0, ptrdiff_t o1, ptrdiff_t o2) const { + return Offsets{o0, o1, o2}; +} + +size_t width_; +FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 3x3 filter border type. +template +using FixedBorderInfo3x3 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_GENERIC_H diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index e5ee0ee44..2a363b3bb 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -359,27 +359,27 @@ class GaussianBlur { svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, svuint32_t src_14, DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); - svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); svuint32_t acc_6_8 = svadd_u32_x(pg, src_6, src_8); + acc_6_8 = svmul_n_u32_x(pg, acc_6_8, 146); svuint32_t acc_5_9 = svadd_u32_x(pg, src_5, src_9); - svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); svuint32_t acc_3_11 = svadd_u32_x(pg, src_3, src_11); + acc_3_11 = svmul_n_u32_x(pg, acc_3_11, 48); svuint32_t acc_4_10 = svadd_u32_x(pg, src_4, src_10); - + svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); + acc_2_12 = svmul_n_u32_x(pg, acc_2_12, 25); + svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); + svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); acc_0_14 = svlsl_n_u32_x(pg, acc_0_14, 2); - acc_3_11 = svlsl_n_u32_x(pg, acc_3_11, 2); - acc_4_10 = svmul_n_u32_x(pg, acc_4_10, 81); - - svuint32_t acc_1_3_11_13 = svadd_u32_x(pg, acc_3_11, acc_1_13); - acc_1_3_11_13 = svmla_n_u32_x(pg, acc_3_11, acc_1_3_11_13, 11); - svuint32_t acc_0_1_3_11_13_14 = svadd_u32_x(pg, acc_1_3_11_13, acc_0_14); - svuint32_t acc_2_4_10_12 = svmla_n_u32_x(pg, acc_4_10, acc_2_12, 25); - - svuint32_t acc = svadd_u32_x(pg, acc_2_4_10_12, acc_0_1_3_11_13_14); - acc = svmla_n_u32_x(pg, acc, acc_6_8, 146); - acc = svmla_n_u32_x(pg, acc, acc_5_9, 118); - acc = svmla_n_u32_x(pg, acc, src_7, 158); + + svuint32_t acc_6_8_5_9 = svmla_n_u32_x(pg, acc_6_8, acc_5_9, 118); + svuint32_t acc_3_11_4_10 = svmla_n_u32_x(pg, acc_3_11, acc_4_10, 81); + svuint32_t acc_2_12_1_13 = svmla_n_u32_x(pg, acc_2_12, acc_1_13, 11); + svuint32_t acc_7_0_14 = svmla_n_u32_x(pg, acc_0_14, src_7, 158); + + svuint32_t acc1 = svadd_u32_x(pg, acc_6_8_5_9, acc_3_11_4_10); + svuint32_t acc2 = svadd_u32_x(pg, acc_2_12_1_13, acc_7_0_14); + + svuint32_t acc = svadd_u32_x(pg, acc1, acc2); acc = svrshr_n_u32_x(pg, acc, 20); svst1b_u32(pg, &dst[0], acc); } @@ -696,38 +696,33 @@ class GaussianBlur final svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, svuint32_t src_14, DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - // 7 - svuint32_t acc = svmul_n_u32_x(pg, src_7, half_kernel_[7]); - - // 6 - 8 svuint32_t acc_6_8 = svadd_u32_x(pg, src_6, src_8); - acc = svmla_n_u32_x(pg, acc, acc_6_8, half_kernel_[6]); + acc_6_8 = svmul_n_u32_x(pg, acc_6_8, half_kernel_[6]); - // 5 - 9 svuint32_t acc_5_9 = svadd_u32_x(pg, src_5, src_9); - acc = svmla_n_u32_x(pg, acc, acc_5_9, half_kernel_[5]); + acc_5_9 = svmul_n_u32_x(pg, acc_5_9, half_kernel_[5]); - // 4 - 10 svuint32_t acc_4_10 = svadd_u32_x(pg, src_4, src_10); - acc = svmla_n_u32_x(pg, acc, acc_4_10, half_kernel_[4]); + acc_4_10 = svmul_n_u32_x(pg, acc_4_10, half_kernel_[4]); - // 3 - 11 svuint32_t acc_3_11 = svadd_u32_x(pg, src_3, src_11); - acc = svmla_n_u32_x(pg, acc, acc_3_11, half_kernel_[3]); + acc_3_11 = svmul_n_u32_x(pg, acc_3_11, half_kernel_[3]); - // 2 - 12 svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); - acc = svmla_n_u32_x(pg, acc, acc_2_12, half_kernel_[2]); + svuint32_t acc1 = svmla_n_u32_x(pg, acc_6_8, acc_2_12, half_kernel_[2]); - // 1 - 13 svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); - acc = svmla_n_u32_x(pg, acc, acc_1_13, half_kernel_[1]); + svuint32_t acc2 = svmla_n_u32_x(pg, acc_5_9, acc_1_13, half_kernel_[1]); - // 0 - 14 svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); - acc = svmla_n_u32_x(pg, acc, acc_0_14, half_kernel_[0]); + svuint32_t acc3 = svmla_n_u32_x(pg, acc_4_10, acc_0_14, half_kernel_[0]); - acc = svrshr_n_u32_x(pg, acc, 16); + svuint32_t acc4 = svmla_n_u32_x(pg, acc_3_11, src_7, half_kernel_[7]); + + acc1 = svadd_u32_x(pg, acc1, acc2); + acc2 = svadd_u32_x(pg, acc3, acc4); + + svuint32_t acc = svrshr_n_u32_x(pg, svadd_u32_x(pg, acc1, acc2), 16); svst1b_u32(pg, &dst[0], acc); } }; // end of class GaussianBlur -- GitLab