From b8ad1d8ec34d5564db1254a9d1be35ae94bd04b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Mon, 22 Jul 2024 14:36:25 +0200 Subject: [PATCH 1/6] Add OpenCV benchmarks for Separable Filter 2D --- scripts/benchmark/run_benchmarks_4K.sh | 2 ++ scripts/benchmark/run_benchmarks_FHD.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/benchmark/run_benchmarks_4K.sh b/scripts/benchmark/run_benchmarks_4K.sh index 73d44f625..837f8a3af 100755 --- a/scripts/benchmark/run_benchmarks_4K.sh +++ b/scripts/benchmark/run_benchmarks_4K.sh @@ -38,6 +38,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL BinaryThreshold opencv_perf_imgproc '*ThreshFixture_Threshold.Threshold/*' '(3840x2160, 8UC1, THRESH_BINARY)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL SepFilter2D_5x5 opencv_perf_imgproc '*KleidiCV_SepFilter2D.SepFilter2D/*' '(3840x2160, 8UC1, 5, BORDER_REPLICATE)')") + RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur3x3 opencv_perf_imgproc '*gaussianBlur3x3/*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur5x5 opencv_perf_imgproc '*gaussianBlur5x5/*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur7x7 opencv_perf_imgproc '*gaussianBlur7x7/*' '(3840x2160, 8UC1, BORDER_REPLICATE)')") diff --git a/scripts/benchmark/run_benchmarks_FHD.sh b/scripts/benchmark/run_benchmarks_FHD.sh index e5a3ee2bd..838061b88 100755 --- a/scripts/benchmark/run_benchmarks_FHD.sh +++ b/scripts/benchmark/run_benchmarks_FHD.sh @@ -38,6 +38,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL BinaryThreshold opencv_perf_imgproc '*ThreshFixture_Threshold.Threshold/*' '(1920x1080, 8UC1, THRESH_BINARY)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL SepFilter2D_5x5 opencv_perf_imgproc '*KleidiCV_SepFilter2D.SepFilter2D/*' '(1920x1080, 8UC1, 5, BORDER_REPLICATE)')") + RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur3x3 opencv_perf_imgproc '*gaussianBlur3x3/*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur5x5 opencv_perf_imgproc '*gaussianBlur5x5/*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL GaussianBlur7x7 opencv_perf_imgproc '*gaussianBlur7x7/*' '(1920x1080, 8UC1, BORDER_REPLICATE)')") -- GitLab From 296b21036a92300b3f5434436877000bf4eb28ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Thu, 18 Jul 2024 18:19:38 +0200 Subject: [PATCH 2/6] Refactor and unify border header files --- .../kleidicv/separable_filter_15x15_neon.h | 154 +++++----- .../kleidicv/separable_filter_15x15_sc.h | 124 ++++---- .../kleidicv/separable_filter_3x3_neon.h | 40 +-- .../kleidicv/separable_filter_3x3_sc.h | 28 +- .../kleidicv/separable_filter_5x5_neon.h | 54 ++-- .../kleidicv/separable_filter_5x5_sc.h | 44 +-- .../kleidicv/separable_filter_7x7_neon.h | 74 ++--- .../kleidicv/separable_filter_7x7_sc.h | 60 ++-- kleidicv/include/kleidicv/workspace/border.h | 206 +++++++++++++ .../include/kleidicv/workspace/border_15x15.h | 276 ------------------ .../include/kleidicv/workspace/border_3x3.h | 116 -------- .../include/kleidicv/workspace/border_5x5.h | 162 ---------- .../include/kleidicv/workspace/border_7x7.h | 181 ------------ 13 files changed, 495 insertions(+), 1024 deletions(-) create mode 100644 kleidicv/include/kleidicv/workspace/border.h delete mode 100644 kleidicv/include/kleidicv/workspace/border_15x15.h delete mode 100644 kleidicv/include/kleidicv/workspace/border_3x3.h delete mode 100644 kleidicv/include/kleidicv/workspace/border_5x5.h delete mode 100644 kleidicv/include/kleidicv/workspace/border_7x7.h diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h index 2475d1db3..7e6a4227f 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border_15x15.h" +#include "kleidicv/workspace/border.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,41 +42,41 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[15]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); - src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]); - src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]); - src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]); - src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]); - src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]); - src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]); - src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]); - src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); + src[7] = vld1q(&src_rows.at(border_offsets.c(7))[index]); + src[8] = vld1q(&src_rows.at(border_offsets.c(8))[index]); + src[9] = vld1q(&src_rows.at(border_offsets.c(9))[index]); + src[10] = vld1q(&src_rows.at(border_offsets.c(10))[index]); + src[11] = vld1q(&src_rows.at(border_offsets.c(11))[index]); + src[12] = vld1q(&src_rows.at(border_offsets.c(12))[index]); + src[13] = vld1q(&src_rows.at(border_offsets.c(13))[index]); + src[14] = vld1q(&src_rows.at(border_offsets.c(14))[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[15]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - src[5] = src_rows.at(border_offsets.c5())[index]; - src[6] = src_rows.at(border_offsets.c6())[index]; - src[7] = src_rows.at(border_offsets.c7())[index]; - src[8] = src_rows.at(border_offsets.c8())[index]; - src[9] = src_rows.at(border_offsets.c9())[index]; - src[10] = src_rows.at(border_offsets.c10())[index]; - src[11] = src_rows.at(border_offsets.c11())[index]; - src[12] = src_rows.at(border_offsets.c12())[index]; - src[13] = src_rows.at(border_offsets.c13())[index]; - src[14] = src_rows.at(border_offsets.c14())[index]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; + src[5] = src_rows.at(border_offsets.c(5))[index]; + src[6] = src_rows.at(border_offsets.c(6))[index]; + src[7] = src_rows.at(border_offsets.c(7))[index]; + src[8] = src_rows.at(border_offsets.c(8))[index]; + src[9] = src_rows.at(border_offsets.c(9))[index]; + src[10] = src_rows.at(border_offsets.c(10))[index]; + src[11] = src_rows.at(border_offsets.c(11))[index]; + src[12] = src_rows.at(border_offsets.c(12))[index]; + src[13] = src_rows.at(border_offsets.c(13))[index]; + src[14] = src_rows.at(border_offsets.c(14))[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -88,21 +88,21 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; - auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; - auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; - auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; - auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; - auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; - auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; - auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; + auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; + auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; + auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; + auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; + auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; + auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; + auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; BufferVectorType src_a[15], src_b[15]; src_a[0] = vld1q(&src_0[0]); @@ -143,21 +143,21 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[15]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); - src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]); - src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]); - src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]); - src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]); - src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]); - src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]); - src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]); - src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + src[7] = vld1q(&src_rows.at(0, border_offsets.c(7))[index]); + src[8] = vld1q(&src_rows.at(0, border_offsets.c(8))[index]); + src[9] = vld1q(&src_rows.at(0, border_offsets.c(9))[index]); + src[10] = vld1q(&src_rows.at(0, border_offsets.c(10))[index]); + src[11] = vld1q(&src_rows.at(0, border_offsets.c(11))[index]); + src[12] = vld1q(&src_rows.at(0, border_offsets.c(12))[index]); + src[13] = vld1q(&src_rows.at(0, border_offsets.c(13))[index]); + src[14] = vld1q(&src_rows.at(0, border_offsets.c(14))[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -181,21 +181,21 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - src[7] = src_rows.at(0, border_offsets.c7())[index]; - src[8] = src_rows.at(0, border_offsets.c8())[index]; - src[9] = src_rows.at(0, border_offsets.c9())[index]; - src[10] = src_rows.at(0, border_offsets.c10())[index]; - src[11] = src_rows.at(0, border_offsets.c11())[index]; - src[12] = src_rows.at(0, border_offsets.c12())[index]; - src[13] = src_rows.at(0, border_offsets.c13())[index]; - src[14] = src_rows.at(0, border_offsets.c14())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[7] = src_rows.at(0, border_offsets.c(7))[index]; + src[8] = src_rows.at(0, border_offsets.c(8))[index]; + src[9] = src_rows.at(0, border_offsets.c(9))[index]; + src[10] = src_rows.at(0, border_offsets.c(10))[index]; + src[11] = src_rows.at(0, border_offsets.c(11))[index]; + src[12] = src_rows.at(0, border_offsets.c(12))[index]; + src[13] = src_rows.at(0, border_offsets.c(13))[index]; + src[14] = src_rows.at(0, border_offsets.c(14))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h index f95067a09..b38193b51 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_15x15.h" +#include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,35 +95,35 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c5())[index]); + svld1(pg, &src_rows.at(border_offsets.c(5))[index]); SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c6())[index]); + svld1(pg, &src_rows.at(border_offsets.c(6))[index]); SourceVectorType src_7 = - svld1(pg, &src_rows.at(border_offsets.c7())[index]); + svld1(pg, &src_rows.at(border_offsets.c(7))[index]); SourceVectorType src_8 = - svld1(pg, &src_rows.at(border_offsets.c8())[index]); + svld1(pg, &src_rows.at(border_offsets.c(8))[index]); SourceVectorType src_9 = - svld1(pg, &src_rows.at(border_offsets.c9())[index]); + svld1(pg, &src_rows.at(border_offsets.c(9))[index]); SourceVectorType src_10 = - svld1(pg, &src_rows.at(border_offsets.c10())[index]); + svld1(pg, &src_rows.at(border_offsets.c(10))[index]); SourceVectorType src_11 = - svld1(pg, &src_rows.at(border_offsets.c11())[index]); + svld1(pg, &src_rows.at(border_offsets.c(11))[index]); SourceVectorType src_12 = - svld1(pg, &src_rows.at(border_offsets.c12())[index]); + svld1(pg, &src_rows.at(border_offsets.c(12))[index]); SourceVectorType src_13 = - svld1(pg, &src_rows.at(border_offsets.c13())[index]); + svld1(pg, &src_rows.at(border_offsets.c(13))[index]); SourceVectorType src_14 = - svld1(pg, &src_rows.at(border_offsets.c14())[index]); + svld1(pg, &src_rows.at(border_offsets.c(14))[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, src_9, src_10, src_11, src_12, src_13, src_14, &dst_rows[index]); @@ -133,21 +133,21 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; - auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; - auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; - auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; - auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; - auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; - auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; - auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; + auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; + auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; + auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; + auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; + auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; + auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; + auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -195,35 +195,35 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); BufferVectorType src_7 = - svld1(pg, &src_rows.at(0, border_offsets.c7())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(7))[index]); BufferVectorType src_8 = - svld1(pg, &src_rows.at(0, border_offsets.c8())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(8))[index]); BufferVectorType src_9 = - svld1(pg, &src_rows.at(0, border_offsets.c9())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(9))[index]); BufferVectorType src_10 = - svld1(pg, &src_rows.at(0, border_offsets.c10())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(10))[index]); BufferVectorType src_11 = - svld1(pg, &src_rows.at(0, border_offsets.c11())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(11))[index]); BufferVectorType src_12 = - svld1(pg, &src_rows.at(0, border_offsets.c12())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(12))[index]); BufferVectorType src_13 = - svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(13))[index]); BufferVectorType src_14 = - svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(14))[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, src_9, src_10, src_11, src_12, src_13, src_14, &dst_rows[index]); @@ -234,21 +234,21 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - src[7] = src_rows.at(0, border_offsets.c7())[index]; - src[8] = src_rows.at(0, border_offsets.c8())[index]; - src[9] = src_rows.at(0, border_offsets.c9())[index]; - src[10] = src_rows.at(0, border_offsets.c10())[index]; - src[11] = src_rows.at(0, border_offsets.c11())[index]; - src[12] = src_rows.at(0, border_offsets.c12())[index]; - src[13] = src_rows.at(0, border_offsets.c13())[index]; - src[14] = src_rows.at(0, border_offsets.c14())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[7] = src_rows.at(0, border_offsets.c(7))[index]; + src[8] = src_rows.at(0, border_offsets.c(8))[index]; + src[9] = src_rows.at(0, border_offsets.c(9))[index]; + src[10] = src_rows.at(0, border_offsets.c(10))[index]; + src[11] = src_rows.at(0, border_offsets.c(11))[index]; + src[12] = src_rows.at(0, border_offsets.c(12))[index]; + src[13] = src_rows.at(0, border_offsets.c(13))[index]; + src[14] = src_rows.at(0, border_offsets.c(14))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h index 3fecea047..d26e54d86 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border_3x3.h" +#include "kleidicv/workspace/border.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -41,9 +41,9 @@ class SeparableFilter { SourceVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c0())[index]; - auto src_1 = &src_rows.at(border_offsets.c1())[index]; - auto src_2 = &src_rows.at(border_offsets.c2())[index]; + auto src_0 = &src_rows.at(border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(border_offsets.c(2))[index]; auto src_0_x2 = vld1q_x2(&src_0[0]); auto src_1_x2 = vld1q_x2(&src_1[0]); @@ -64,17 +64,17 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[3]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -86,9 +86,9 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; auto src_0_x2 = vld1q_x2(&src_0[0]); auto src_1_x2 = vld1q_x2(&src_1[0]); @@ -109,9 +109,9 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -135,9 +135,9 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h index 6f624ae1c..8c7e092ab 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_3x3.h" +#include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,11 +95,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } @@ -107,9 +107,9 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -130,11 +130,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } @@ -143,9 +143,9 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h index 34f4290d7..46746dbd8 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border_5x5.h" +#include "kleidicv/workspace/border.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,21 +42,21 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[5]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -68,11 +68,11 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; BufferVectorType src_a[5], src_b[5]; src_a[0] = vld1q(&src_0[0]); @@ -93,11 +93,11 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -121,11 +121,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h index 909e8ce18..3ca4075cf 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_5x5.h" +#include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,15 +95,15 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, &dst_rows[index]); } @@ -112,11 +112,11 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -141,15 +141,15 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, &dst_rows[index]); } @@ -159,11 +159,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h index 4305d9d06..2d804d933 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border_7x7.h" +#include "kleidicv/workspace/border.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,25 +42,25 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[7]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[7]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - src[5] = src_rows.at(border_offsets.c5())[index]; - src[6] = src_rows.at(border_offsets.c6())[index]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; + src[5] = src_rows.at(border_offsets.c(5))[index]; + src[6] = src_rows.at(border_offsets.c(6))[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -72,13 +72,13 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; BufferVectorType src_a[7], src_b[7]; src_a[0] = vld1q(&src_0[0]); @@ -103,13 +103,13 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[7]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -133,13 +133,13 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h index 33f204a10..eab3df4b1 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_7x7.h" +#include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,19 +95,19 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c5())[index]); + svld1(pg, &src_rows.at(border_offsets.c(5))[index]); SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c6())[index]); + svld1(pg, &src_rows.at(border_offsets.c(6))[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, &dst_rows[index]); } @@ -116,13 +116,13 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -151,19 +151,19 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); + svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, &dst_rows[index]); } @@ -173,13 +173,13 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/workspace/border.h b/kleidicv/include/kleidicv/workspace/border.h new file mode 100644 index 000000000..35d5f5415 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border.h @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_H +#define KLEIDICV_WORKSPACE_BORDER_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + // Note: We are not using the default constructor, but it is defined for the + // unreachable cases in the code below. NOLINTNEXTLINE + class Offsets final { + public: + Offsets() = default; + + template + explicit Offsets(Args... args) : offsets_{static_cast(args)...} { + static_assert(sizeof...(args) == KernelSize); + } + + size_t c(int i) const { return offsets_[i]; } + + private: + size_t offsets_[KernelSize]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + constexpr auto seq = std::make_integer_sequence> 1)>{}; + return get_no_border(seq); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + constexpr auto seq = std::make_integer_sequence> 1)>{}; + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index, seq); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index, seq); + break; + + case FixedBorderType::WRAP: + return get_border(column_index, seq); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index, seq); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + constexpr auto seq = std::make_integer_sequence> 1)>{}; + column_index = height_ - column_index - 1; + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index, seq); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index, seq); + break; + + case FixedBorderType::WRAP: + return get_border(column_index, seq); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index, seq); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index < (KernelSize >> 1)) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - (KernelSize >> 1))) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Creates the Offsets object containing offsets in the interval + // [-(KernelSize / 2), KernelSize / 2]. + template + inline Offsets get_no_border(std::integer_sequence) const + KLEIDICV_STREAMING_COMPATIBLE { + // Example (15x15): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + // 7}; + return Offsets{(SeqNum - (KernelSize >> 1))..., 0, (SeqNum + 1)...}; + } + + // Creates the Offsets object containing offsets in various intervals + // depending on the column, border type as well the border position used. + // NOLINTBEGIN(readability-function-cognitive-complexity) + template + inline Offsets get_border(int column, std::integer_sequence) + const KLEIDICV_STREAMING_COMPATIBLE { + if constexpr (BorderType == FixedBorderType::REPLICATE && !IsRight) { + // Example (15x15, index 4, left): Offsets{-4, -4, -4, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 5, 6, 7}; + return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -column) + ? -column + : (SeqNum - (KernelSize >> 1))..., + 0, (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::REPLICATE && IsRight) { + // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 4, 4, 4}; + return Offsets{(SeqNum - (KernelSize >> 1))..., 0, + (SeqNum >= column) ? column : (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::REFLECT && !IsRight) { + // Example (15x15, index 4, left): Offsets{-2, -3, -4, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 5, 6, 7}; + return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -column) + ? ((KernelSize >> 1) - (column << 1) - (SeqNum + 1)) + : (SeqNum - (KernelSize >> 1))..., + 0, (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::REFLECT && IsRight) { + // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 4, 3, 2}; + return Offsets{ + (SeqNum - (KernelSize >> 1))..., 0, + (SeqNum >= column) ? ((column << 1) - SeqNum) : (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::WRAP && !IsRight) { + // Example (15x15, index 4, left): Offsets{height_ - 7, height_ - 6, + // height_ - 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}; + return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -column) + ? (SeqNum - (KernelSize >> 1) + height_) + : (SeqNum - (KernelSize >> 1))..., + 0, (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::WRAP && IsRight) { + // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 5 - height_, 6 - height_, 7 - height_}; + return Offsets{ + (SeqNum - (KernelSize >> 1))..., 0, + (SeqNum >= column) ? (SeqNum - height_ + 1) : (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::REVERSE && !IsRight) { + // Example (15x15, index 4, left): Offsets{-1, -2, -3, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 5, 6, 7}; + return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -column) + ? ((KernelSize >> 1) - (column << 1) - SeqNum) + : (SeqNum - (KernelSize >> 1))..., + 0, (SeqNum + 1)...}; + } + + if constexpr (BorderType == FixedBorderType::REVERSE && IsRight) { + // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, + // 1, 2, 3, 4, 3, 2, 1}; + return Offsets{(SeqNum - (KernelSize >> 1))..., 0, + (SeqNum >= column) ? ((column << 1) - (SeqNum + 1)) + : (SeqNum + 1)...}; + } + } + // NOLINTEND(readability-function-cognitive-complexity) + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_H diff --git a/kleidicv/include/kleidicv/workspace/border_15x15.h b/kleidicv/include/kleidicv/workspace/border_15x15.h deleted file mode 100644 index eb3ae12ad..000000000 --- a/kleidicv/include/kleidicv/workspace/border_15x15.h +++ /dev/null @@ -1,276 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDER_15X15_H -#define KLEIDICV_WORKSPACE_BORDER_15X15_H - -#include "border_types.h" -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -// Border offsets for 15x15 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - // NOLINTBEGIN(hicpp-member-init) - Offsets() = default; - // NOLINTEND(hicpp-member-init) - - Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, size_t o11, - size_t o12, size_t o13, size_t o14) - : offsets_{o0, o1, o2, o3, o4, o5, o6, o7, - o8, o9, o10, o11, o12, o13, o14} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - size_t c5() const { return offsets_[5]; } - size_t c6() const { return offsets_[6]; } - size_t c7() const { return offsets_[7]; } - size_t c8() const { return offsets_[8]; } - size_t c9() const { return offsets_[9]; } - size_t c10() const { return offsets_[10]; } - size_t c11() const { return offsets_[11]; } - size_t c12() const { return offsets_[12]; } - size_t c13() const { return offsets_[13]; } - size_t c14() const { return offsets_[14]; } - - private: - size_t offsets_[15]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } - - // NOLINTBEGIN(readability-function-cognitive-complexity) - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == 0) { - return get(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 1) { - return get(-1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 2) { - return get(-2, -2, -2, -2, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 3) { - return get(-3, -3, -3, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 4) { - return get(-4, -4, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 5) { - return get(-5, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else { - return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == 0) { - return get(6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 1) { - return get(4, 3, 2, 1, 0, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 2) { - return get(2, 1, 0, -1, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 3) { - return get(0, -1, -2, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 4) { - return get(-2, -3, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 5) { - return get(-4, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else { - return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } - break; - - case FixedBorderType::WRAP: - if (column_index == 0) { - return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, - height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3, 4, 5, 6, - 7); - } else if (column_index == 1) { - return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, - height_ - 3, height_ - 2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 2) { - return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, - height_ - 3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 3) { - return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, -3, -2, - -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 4) { - return get(height_ - 7, height_ - 6, height_ - 5, -4, -3, -2, -1, 0, - 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 5) { - return get(height_ - 7, height_ - 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, - 4, 5, 6, 7); - } else { - return get(height_ - 7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, - 7); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == 0) { - return get(7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 1) { - return get(5, 4, 3, 2, 1, 0, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 2) { - return get(3, 2, 1, 0, -1, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 3) { - return get(1, 0, -1, -2, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 4) { - return get(-1, -2, -3, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else if (column_index == 5) { - return get(-3, -4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } else { - return get(-5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == (height_ - 7)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); - } else if (column_index == (height_ - 6)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 5); - } else if (column_index == (height_ - 5)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 4, 4); - } else if (column_index == (height_ - 4)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, 3, 3); - } else if (column_index == (height_ - 3)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 2, 2, 2, 2); - } else if (column_index == (height_ - 2)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 1, 1, 1, 1, 1); - } else { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == (height_ - 7)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); - } else if (column_index == (height_ - 6)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 4); - } else if (column_index == (height_ - 5)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 3, 2); - } else if (column_index == (height_ - 4)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 2, 1, 0); - } else if (column_index == (height_ - 3)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 1, 0, -1, -2); - } else if (column_index == (height_ - 2)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 0, -1, -2, -3, -4); - } else { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, -1, -2, -3, -4, -5, -6); - } - break; - - case FixedBorderType::WRAP: - if (column_index == (height_ - 7)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, - 7 - height_); - } else if (column_index == (height_ - 6)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6 - height_, - 7 - height_); - } else if (column_index == (height_ - 5)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 - height_, - 6 - height_, 7 - height_); - } else if (column_index == (height_ - 4)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 - height_, - 5 - height_, 6 - height_, 7 - height_); - } else if (column_index == (height_ - 3)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3 - height_, - 4 - height_, 5 - height_, 6 - height_, 7 - height_); - } else if (column_index == (height_ - 2)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2 - height_, 3 - height_, - 4 - height_, 5 - height_, 6 - height_, 7 - height_); - } else { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1 - height_, 2 - height_, - 3 - height_, 4 - height_, 5 - height_, 6 - height_, - 7 - height_); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == (height_ - 7)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 5); - } else if (column_index == (height_ - 6)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 4, 3); - } else if (column_index == (height_ - 5)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 3, 2, 1); - } else if (column_index == (height_ - 4)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 2, 1, 0, -1); - } else if (column_index == (height_ - 3)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 1, 0, -1, -2, -3); - } else if (column_index == (height_ - 2)) { - return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 0, -1, -2, -3, -4, -5); - } else { - return get(-7, -6, -5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5, -6, -7); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - // NOLINTEND(readability-function-cognitive-complexity) - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index <= 6U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index >= (height_ - 7U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, - size_t o11, size_t o12, size_t o13, - size_t o14) const KLEIDICV_STREAMING_COMPATIBLE { - return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, - o8, o9, o10, o11, o12, o13, o14}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Shorthand for 15x15 filter border type. -template -using FixedBorderInfo15x15 = FixedBorderInfo; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDER_15X15_H diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h deleted file mode 100644 index ecd5627d3..000000000 --- a/kleidicv/include/kleidicv/workspace/border_3x3.h +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDER_3X3_H -#define KLEIDICV_WORKSPACE_BORDER_3X3_H - -#include "border_types.h" -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -// Border offsets for 3x3 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - Offsets() = default; - - Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - - private: - size_t offsets_[3]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const { return get(-1, 0, 1); } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(0, 0, 1); - break; - - case FixedBorderType::WRAP: - return get(height_ - 1, 0, 1); - break; - - case FixedBorderType::REVERSE: - return get(1, 0, 1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - case FixedBorderType::REFLECT: - return get(-1, 0, 0); - break; - - case FixedBorderType::WRAP: - return get(-1, 0, 1 - height_); - break; - - case FixedBorderType::REVERSE: - return get(-1, 0, -1); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index == 0U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index == (height_ - 1U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2) const { - return Offsets{o0, o1, o2}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Shorthand for 3x3 filter border type. -template -using FixedBorderInfo3x3 = FixedBorderInfo; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDER_3X3_H diff --git a/kleidicv/include/kleidicv/workspace/border_5x5.h b/kleidicv/include/kleidicv/workspace/border_5x5.h deleted file mode 100644 index 06c2683bd..000000000 --- a/kleidicv/include/kleidicv/workspace/border_5x5.h +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDER_5X5_H -#define KLEIDICV_WORKSPACE_BORDER_5X5_H - -#include "border_types.h" -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -// Border offsets for 5x5 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - // NOLINTBEGIN(hicpp-member-init) - Offsets() = default; - // NOLINTEND(hicpp-member-init) - - Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4) - : offsets_{o0, o1, o2, o3, o4} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - - private: - size_t offsets_[5]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - return get(-2, -1, 0, 1, 2); - } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == 0) { - return get(0, 0, 0, 1, 2); - } else { - return get(-1, -1, 0, 1, 2); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == 0) { - return get(1, 0, 0, 1, 2); - } else { - return get(-1, -1, 0, 1, 2); - } - break; - - case FixedBorderType::WRAP: - if (column_index == 0) { - return get(height_ - 2, height_ - 1, 0, 1, 2); - } else { - return get(height_ - 2, -1, 0, 1, 2); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == 0) { - return get(2, 1, 0, 1, 2); - } else { - return get(0, -1, 0, 1, 2); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 1); - } else { - return get(-2, -1, 0, 0, 0); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 1); - } else { - return get(-2, -1, 0, 0, -1); - } - break; - - case FixedBorderType::WRAP: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 2 - height_); - } else { - return get(-2, -1, 0, 1 - height_, 2 - height_); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == (height_ - 2)) { - return get(-2, -1, 0, 1, 0); - } else { - return get(-2, -1, 0, -1, -2); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index <= 1U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index >= (height_ - 2U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, - size_t o4) const KLEIDICV_STREAMING_COMPATIBLE { - return Offsets{o0, o1, o2, o3, o4}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Shorthand for 5x5 filter border type. -template -using FixedBorderInfo5x5 = FixedBorderInfo; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDER_5X5_H diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h deleted file mode 100644 index 75bb86117..000000000 --- a/kleidicv/include/kleidicv/workspace/border_7x7.h +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDER_7X7_H -#define KLEIDICV_WORKSPACE_BORDER_7X7_H - -#include "border_types.h" -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -// Border offsets for 7x7 filters. -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - class Offsets final { - public: - // NOLINTBEGIN(hicpp-member-init) - Offsets() = default; - // NOLINTEND(hicpp-member-init) - - Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6) - : offsets_{o0, o1, o2, o3, o4, o5, o6} {} - - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - size_t c5() const { return offsets_[5]; } - size_t c6() const { return offsets_[6]; } - - private: - size_t offsets_[7]; - }; - - FixedBorderInfo(size_t height, FixedBorderType border_type) - : height_(height), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - return get(-3, -2, -1, 0, 1, 2, 3); - } - - // Returns offsets for columns affected by left border. - Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == 0) { - return get(0, 0, 0, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(-1, -1, -1, 0, 1, 2, 3); - } else { - return get(-2, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == 0) { - return get(2, 1, 0, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(0, -1, -1, 0, 1, 2, 3); - } else { - return get(-2, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::WRAP: - if (column_index == 0) { - return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); - } else { - return get(height_ - 3, -2, -1, 0, 1, 2, 3); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == 0) { - return get(3, 2, 1, 0, 1, 2, 3); - } else if (column_index == 1) { - return get(1, 0, -1, 0, 1, 2, 3); - } else { - return get(-1, -2, -1, 0, 1, 2, 3); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for columns affected by right border. - Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - switch (border_type_) { - case FixedBorderType::REPLICATE: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 2); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 1, 1); - } else { - return get(-3, -2, -1, 0, 0, 0, 0); - } - break; - - case FixedBorderType::REFLECT: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 2); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 1, 0); - } else { - return get(-3, -2, -1, 0, 0, -1, -2); - } - break; - - case FixedBorderType::WRAP: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 3 - height_); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); - } else { - return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); - } - break; - - case FixedBorderType::REVERSE: - if (column_index == (height_ - 3)) { - return get(-3, -2, -1, 0, 1, 2, 1); - } else if (column_index == (height_ - 2)) { - return get(-3, -2, -1, 0, 1, 0, -1); - } else { - return get(-3, -2, -1, 0, -1, -2, -3); - } - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows or columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index <= 2U) { - // Rows and columns have the same offsets. - return offsets_with_left_border(row_or_column_index); - } - if (row_or_column_index >= (height_ - 3U)) { - // Rows and columns have the same offsets. - return offsets_with_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Takes care of static signed to unsigned casts. - Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, - size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { - return Offsets{o0, o1, o2, o3, o4, o5, o6}; - } - - size_t height_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -// Shorthand for 7x7 filter border type. -template -using FixedBorderInfo7x7 = FixedBorderInfo; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDER_7X7_H -- GitLab From 239d7d24e18a788273248503244800a395bcec44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 19 Jul 2024 13:17:25 +0200 Subject: [PATCH 3/6] Refactor and unify separable filter drivers --- .../kleidicv/separable_filter_15x15_neon.h | 211 -------------- .../kleidicv/separable_filter_15x15_sc.h | 264 ------------------ .../kleidicv/separable_filter_3x3_neon.h | 153 ---------- .../kleidicv/separable_filter_5x5_neon.h | 141 ---------- .../kleidicv/separable_filter_5x5_sc.h | 179 ------------ .../kleidicv/separable_filter_7x7_neon.h | 155 ---------- .../kleidicv/separable_filter_7x7_sc.h | 195 ------------- .../kleidicv/separable_filter_driver_neon.h | 140 ++++++++++ ..._3x3_sc.h => separable_filter_driver_sc.h} | 127 ++++----- kleidicv/src/filters/gaussian_blur_neon.cpp | 7 +- kleidicv/src/filters/gaussian_blur_sc.h | 7 +- .../src/filters/separable_filter_2d_neon.cpp | 4 +- kleidicv/src/filters/separable_filter_2d_sc.h | 4 +- kleidicv/src/filters/sobel_neon.cpp | 7 +- kleidicv/src/filters/sobel_sc.h | 7 +- 15 files changed, 214 insertions(+), 1387 deletions(-) delete mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_sc.h create mode 100644 kleidicv/include/kleidicv/separable_filter_driver_neon.h rename kleidicv/include/kleidicv/{separable_filter_3x3_sc.h => separable_filter_driver_sc.h} (52%) diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h deleted file mode 100644 index 7e6a4227f..000000000 --- a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h +++ /dev/null @@ -1,211 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 15x15 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 7UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[15]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); - src[7] = vld1q(&src_rows.at(border_offsets.c(7))[index]); - src[8] = vld1q(&src_rows.at(border_offsets.c(8))[index]); - src[9] = vld1q(&src_rows.at(border_offsets.c(9))[index]); - src[10] = vld1q(&src_rows.at(border_offsets.c(10))[index]); - src[11] = vld1q(&src_rows.at(border_offsets.c(11))[index]); - src[12] = vld1q(&src_rows.at(border_offsets.c(12))[index]); - src[13] = vld1q(&src_rows.at(border_offsets.c(13))[index]); - src[14] = vld1q(&src_rows.at(border_offsets.c(14))[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[15]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; - src[5] = src_rows.at(border_offsets.c(5))[index]; - src[6] = src_rows.at(border_offsets.c(6))[index]; - src[7] = src_rows.at(border_offsets.c(7))[index]; - src[8] = src_rows.at(border_offsets.c(8))[index]; - src[9] = src_rows.at(border_offsets.c(9))[index]; - src[10] = src_rows.at(border_offsets.c(10))[index]; - src[11] = src_rows.at(border_offsets.c(11))[index]; - src[12] = src_rows.at(border_offsets.c(12))[index]; - src[13] = src_rows.at(border_offsets.c(13))[index]; - src[14] = src_rows.at(border_offsets.c(14))[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; - auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; - auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; - auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; - auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; - auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; - auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; - auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; - - BufferVectorType src_a[15], src_b[15]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - src_a[5] = vld1q(&src_5[0]); - src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); - src_a[6] = vld1q(&src_6[0]); - src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); - src_a[7] = vld1q(&src_7[0]); - src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]); - src_a[8] = vld1q(&src_8[0]); - src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]); - src_a[9] = vld1q(&src_9[0]); - src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]); - src_a[10] = vld1q(&src_10[0]); - src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]); - src_a[11] = vld1q(&src_11[0]); - src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]); - src_a[12] = vld1q(&src_12[0]); - src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]); - src_a[13] = vld1q(&src_13[0]); - src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]); - src_a[14] = vld1q(&src_14[0]); - src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[15]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); - src[7] = vld1q(&src_rows.at(0, border_offsets.c(7))[index]); - src[8] = vld1q(&src_rows.at(0, border_offsets.c(8))[index]); - src[9] = vld1q(&src_rows.at(0, border_offsets.c(9))[index]); - src[10] = vld1q(&src_rows.at(0, border_offsets.c(10))[index]); - src[11] = vld1q(&src_rows.at(0, border_offsets.c(11))[index]); - src[12] = vld1q(&src_rows.at(0, border_offsets.c(12))[index]); - src[13] = vld1q(&src_rows.at(0, border_offsets.c(13))[index]); - src[14] = vld1q(&src_rows.at(0, border_offsets.c(14))[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - src[7] = src_rows.at(0, border_offsets.c(7))[index]; - src[8] = src_rows.at(0, border_offsets.c(8))[index]; - src[9] = src_rows.at(0, border_offsets.c(9))[index]; - src[10] = src_rows.at(0, border_offsets.c(10))[index]; - src[11] = src_rows.at(0, border_offsets.c(11))[index]; - src[12] = src_rows.at(0, border_offsets.c(12))[index]; - src[13] = src_rows.at(0, border_offsets.c(13))[index]; - src[14] = src_rows.at(0, border_offsets.c(14))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 15x15 separable filters driver type. -template -using SeparableFilter15x15 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h deleted file mode 100644 index b38193b51..000000000 --- a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H -#define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 15x15 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 7UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); - SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c(5))[index]); - SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c(6))[index]); - SourceVectorType src_7 = - svld1(pg, &src_rows.at(border_offsets.c(7))[index]); - SourceVectorType src_8 = - svld1(pg, &src_rows.at(border_offsets.c(8))[index]); - SourceVectorType src_9 = - svld1(pg, &src_rows.at(border_offsets.c(9))[index]); - SourceVectorType src_10 = - svld1(pg, &src_rows.at(border_offsets.c(10))[index]); - SourceVectorType src_11 = - svld1(pg, &src_rows.at(border_offsets.c(11))[index]); - SourceVectorType src_12 = - svld1(pg, &src_rows.at(border_offsets.c(12))[index]); - SourceVectorType src_13 = - svld1(pg, &src_rows.at(border_offsets.c(13))[index]); - SourceVectorType src_14 = - svld1(pg, &src_rows.at(border_offsets.c(14))[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; - auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; - auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; - auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; - auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; - auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; - auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; - auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - BufferVectorType src_0_5 = svld1(pg, &src_5[0]); - BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); - BufferVectorType src_0_6 = svld1(pg, &src_6[0]); - BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); - BufferVectorType src_0_7 = svld1(pg, &src_7[0]); - BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); - BufferVectorType src_0_8 = svld1(pg, &src_8[0]); - BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); - BufferVectorType src_0_9 = svld1(pg, &src_9[0]); - BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); - BufferVectorType src_0_10 = svld1(pg, &src_10[0]); - BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); - BufferVectorType src_0_11 = svld1(pg, &src_11[0]); - BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); - BufferVectorType src_0_12 = svld1(pg, &src_12[0]); - BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); - BufferVectorType src_0_13 = svld1(pg, &src_13[0]); - BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); - BufferVectorType src_0_14 = svld1(pg, &src_14[0]); - BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, src_0_7, src_0_8, - src_0_9, src_0_10, src_0_11, src_0_12, - src_0_13, src_0_14, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, - src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); - BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); - BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); - BufferVectorType src_7 = - svld1(pg, &src_rows.at(0, border_offsets.c(7))[index]); - BufferVectorType src_8 = - svld1(pg, &src_rows.at(0, border_offsets.c(8))[index]); - BufferVectorType src_9 = - svld1(pg, &src_rows.at(0, border_offsets.c(9))[index]); - BufferVectorType src_10 = - svld1(pg, &src_rows.at(0, border_offsets.c(10))[index]); - BufferVectorType src_11 = - svld1(pg, &src_rows.at(0, border_offsets.c(11))[index]); - BufferVectorType src_12 = - svld1(pg, &src_rows.at(0, border_offsets.c(12))[index]); - BufferVectorType src_13 = - svld1(pg, &src_rows.at(0, border_offsets.c(13))[index]); - BufferVectorType src_14 = - svld1(pg, &src_rows.at(0, border_offsets.c(14))[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - src[7] = src_rows.at(0, border_offsets.c(7))[index]; - src[8] = src_rows.at(0, border_offsets.c(8))[index]; - src[9] = src_rows.at(0, border_offsets.c(9))[index]; - src[10] = src_rows.at(0, border_offsets.c(10))[index]; - src[11] = src_rows.at(0, border_offsets.c(11))[index]; - src[12] = src_rows.at(0, border_offsets.c(12))[index]; - src[13] = src_rows.at(0, border_offsets.c(13))[index]; - src[14] = src_rows.at(0, border_offsets.c(14))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 15x15 separable filters driver type. -template -using SeparableFilter15x15 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_15X15_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h deleted file mode 100644 index d26e54d86..000000000 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 1UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(border_offsets.c(2))[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - SourceVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.vertical_vector_path(src_a, &dst_rows[index]); - filter_.vertical_vector_path( - src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[3]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - BufferVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h deleted file mode 100644 index 46746dbd8..000000000 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 2UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[5]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - - BufferVectorType src_a[5], src_b[5]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h deleted file mode 100644 index 3ca4075cf..000000000 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_SC_H -#define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 2UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_5X5_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h deleted file mode 100644 index 2d804d933..000000000 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 7x7 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 3UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[7]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[7]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; - src[5] = src_rows.at(border_offsets.c(5))[index]; - src[6] = src_rows.at(border_offsets.c(6))[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - - BufferVectorType src_a[7], src_b[7]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - src_a[5] = vld1q(&src_5[0]); - src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); - src_a[6] = vld1q(&src_6[0]); - src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[7]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 7x7 separable filters driver type. -template -using SeparableFilter7x7 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h deleted file mode 100644 index eab3df4b1..000000000 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_SC_H -#define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 7x7 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 3UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); - SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c(5))[index]); - SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c(6))[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - BufferVectorType src_0_5 = svld1(pg, &src_5[0]); - BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); - BufferVectorType src_0_6 = svld1(pg, &src_6[0]); - BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); - BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); - BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 7x7 separable filters driver type. -template -using SeparableFilter7x7 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_7X7_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_neon.h b/kleidicv/include/kleidicv/separable_filter_driver_neon.h new file mode 100644 index 000000000..3a684b7c8 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_driver_neon.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H + +#include "kleidicv/config.h" +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border.h" + +namespace kleidicv::neon { + +// Template for drivers of separable NxM filters. +template +class SeparableFilterDriver { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = typename neon::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilterDriver(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = KernelSize >> 1; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + loop.unroll_once([&](size_t index) { + vertical_vector_path(src_rows, dst_rows, border_offsets, index, seq); + }); + + loop.tail([&](size_t index) { + vertical_scalar_path(src_rows, dst_rows, border_offsets, index, seq); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + loop.unroll_twice([&](size_t index) { + horizontal_vector_path_2x(src_rows, dst_rows, border_offsets, index, seq); + }); + + loop.unroll_once([&](size_t index) { + horizontal_vector_path(src_rows, dst_rows, border_offsets, index, seq); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + constexpr auto seq = std::make_index_sequence{}; + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); + } + } + + private: + template + void vertical_vector_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + SourceVectorType src[KernelSize] = { + vld1q(&src_rows.at(border_offsets.c(SeqNum))[index])...}; + filter_.vertical_vector_path(src, &dst_rows[index]); + } + + template + void vertical_scalar_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + SourceType src[KernelSize] = { + src_rows.at(border_offsets.c(SeqNum))[index]...}; + filter_.vertical_scalar_path(src, &dst_rows[index]); + } + + template + void horizontal_vector_path_2x(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + BufferVectorType src_a[KernelSize] = { + vld1q(&src_rows.at(0, border_offsets.c(SeqNum))[index])...}; + BufferVectorType src_b[KernelSize] = {vld1q(&src_rows.at( + 0, border_offsets.c(SeqNum))[index + BufferVecTraits::num_lanes()])...}; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + template + void horizontal_vector_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + BufferVectorType src[KernelSize] = { + vld1q(&src_rows.at(0, border_offsets.c(SeqNum))[index])...}; + filter_.horizontal_vector_path(src, &dst_rows[index]); + } + + template + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + BufferType src[KernelSize] = { + src_rows.at(0, border_offsets.c(SeqNum))[index]...}; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilterDriver + +} // namespace kleidicv::neon + +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_driver_sc.h similarity index 52% rename from kleidicv/include/kleidicv/separable_filter_3x3_sc.h rename to kleidicv/include/kleidicv/separable_filter_driver_sc.h index 8c7e092ab..34e41b643 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_driver_sc.h @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_SC_H -#define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#include "kleidicv/config.h" #include "kleidicv/sve2.h" #include "kleidicv/workspace/border.h" @@ -12,12 +13,8 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { +template +class SeparableFilterDriver { public: using SourceType = typename FilterType::SourceType; using BufferType = typename FilterType::BufferType; @@ -29,30 +26,33 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} + explicit SeparableFilterDriver(FilterType filter) + KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} - static constexpr size_t margin = 1UL; + static constexpr size_t margin = KernelSize >> 1; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, + seq); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, + size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index, seq); + }); } void process_horizontal(size_t width, Rows src_rows, @@ -61,20 +61,23 @@ class SeparableFilter { KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index, seq); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, + seq); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index, + seq); }); } @@ -90,72 +93,58 @@ class SeparableFilter { } private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + template + void vertical_vector_path( + svbool_t pg, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const KLEIDICV_STREAMING_COMPATIBLE { + filter_.vertical_vector_path( + pg, svld1(pg, &src_rows.at(border_offsets.c(SeqNum))[index])..., + &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, - &dst_rows[index]); + template + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const + KLEIDICV_STREAMING_COMPATIBLE { + filter_.horizontal_vector_path( + pg, svld1(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index])..., + &dst_rows[index]); + filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, + pg, + svld1_vnum(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index], 1)..., &dst_rows[index + BufferVecTraits::num_lanes()]); } + template void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const + BorderOffsets border_offsets, size_t index, + std::index_sequence) const KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, svld1(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index])..., + &dst_rows[index]); } void process_horizontal_border( Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; + BufferType src[KernelSize]; + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 0; i < KernelSize; i++) { + src[i] = src_rows.at(0, border_offsets.c(i))[index]; + } filter_.horizontal_scalar_path(src, &dst_rows[index]); } FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; +}; // end of class SeparableFilterDriver } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SEPARABLE_FILTER_3X3_SC_H +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index bf4d82aa1..e9092553c 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -8,10 +8,7 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_15x15_neon.h" -#include "kleidicv/separable_filter_3x3_neon.h" -#include "kleidicv/separable_filter_5x5_neon.h" -#include "kleidicv/separable_filter_7x7_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" #include "kleidicv/sigma.h" namespace kleidicv::neon { @@ -627,7 +624,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; + SeparableFilterDriver filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index e11fb8a58..9b0cdd809 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,10 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_15x15_sc.h" -#include "kleidicv/separable_filter_3x3_sc.h" -#include "kleidicv/separable_filter_5x5_sc.h" -#include "kleidicv/separable_filter_7x7_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sigma.h" #include "kleidicv/sve2.h" @@ -819,7 +816,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; + SeparableFilterDriver filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 8d3d0d3ed..9faacb3ee 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -8,7 +8,7 @@ #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_5x5_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { @@ -139,7 +139,7 @@ kleidicv_error_t separable_filter_2d_u8( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilter filter{filterClass}; + SeparableFilterDriver filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 9ba9c9fb8..aa9fde64a 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_5x5_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -159,7 +159,7 @@ static kleidicv_error_t separable_filter_2d_u8_sc( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilter filter{filterClass}; + SeparableFilterDriver filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 09e108575..96959ebd8 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -6,7 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_3x3_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { @@ -157,7 +157,8 @@ kleidicv_error_t sobel_3x3_horizontal_s16_u8(const uint8_t *src, } HorizontalSobel3x3 horizontal_sobel; - SeparableFilter3x3> filter{horizontal_sobel}; + SeparableFilterDriver, 3> filter{ + horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -193,7 +194,7 @@ kleidicv_error_t sobel_3x3_vertical_s16_u8(const uint8_t *src, } VerticalSobel3x3 vertical_sobel; - SeparableFilter3x3> filter{vertical_sobel}; + SeparableFilterDriver, 3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 783cd55ad..ab6a0c6e5 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_3x3_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -148,7 +148,8 @@ static kleidicv_error_t sobel_3x3_horizontal_s16_u8_sc( } HorizontalSobel3x3 horizontal_sobel; - SeparableFilter3x3> filter{horizontal_sobel}; + SeparableFilterDriver, 3> filter{ + horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -184,7 +185,7 @@ static kleidicv_error_t sobel_3x3_vertical_s16_u8_sc( } VerticalSobel3x3 vertical_sobel; - SeparableFilter3x3> filter{vertical_sobel}; + SeparableFilterDriver, 3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; -- GitLab From 37fee7150decedf9664259970f397f94155df818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 19 Jul 2024 13:44:31 +0200 Subject: [PATCH 4/6] Refactor filenames --- kleidicv/include/kleidicv/debug.h | 2 +- ...eparable_filter_driver_neon.h => filter_driver_neon.h} | 6 +++--- .../{separable_filter_driver_sc.h => filter_driver_sc.h} | 8 ++++---- kleidicv/include/kleidicv/{sve2.h => sc.h} | 6 +++--- kleidicv/src/analysis/min_max_sc.h | 2 +- kleidicv/src/arithmetics/absdiff_sme2.cpp | 2 +- kleidicv/src/arithmetics/absdiff_sve2.cpp | 2 +- kleidicv/src/arithmetics/add_abs_with_threshold_sc.h | 2 +- kleidicv/src/arithmetics/add_sme2.cpp | 2 +- kleidicv/src/arithmetics/add_sve2.cpp | 2 +- kleidicv/src/arithmetics/compare_sc.h | 2 +- kleidicv/src/arithmetics/exp_sc.h | 2 +- kleidicv/src/arithmetics/multiply_sve2.cpp | 2 +- kleidicv/src/arithmetics/scale_sc.h | 2 +- kleidicv/src/arithmetics/sub_sme2.cpp | 2 +- kleidicv/src/arithmetics/sub_sve2.cpp | 2 +- kleidicv/src/arithmetics/threshold_sc.h | 2 +- kleidicv/src/conversions/float_conv_sc.h | 2 +- kleidicv/src/conversions/gray_to_rgb_sc.h | 2 +- kleidicv/src/conversions/rgb_to_rgb_sc.h | 2 +- kleidicv/src/conversions/rgb_to_yuv_sc.h | 2 +- kleidicv/src/conversions/yuv_sp_to_rgb_sc.h | 2 +- kleidicv/src/conversions/yuv_to_rgb_sc.h | 2 +- kleidicv/src/filters/gaussian_blur_neon.cpp | 2 +- kleidicv/src/filters/gaussian_blur_sc.h | 4 ++-- kleidicv/src/filters/separable_filter_2d_neon.cpp | 2 +- kleidicv/src/filters/separable_filter_2d_sc.h | 4 ++-- kleidicv/src/filters/sobel_neon.cpp | 2 +- kleidicv/src/filters/sobel_sc.h | 4 ++-- kleidicv/src/logical/bitwise_and_sc.h | 2 +- kleidicv/src/morphology/morphology_sc.h | 2 +- kleidicv/src/resize/resize_linear_sc.h | 2 +- kleidicv/src/resize/resize_sc.h | 2 +- 33 files changed, 43 insertions(+), 43 deletions(-) rename kleidicv/include/kleidicv/{separable_filter_driver_neon.h => filter_driver_neon.h} (97%) rename kleidicv/include/kleidicv/{separable_filter_driver_sc.h => filter_driver_sc.h} (97%) rename kleidicv/include/kleidicv/{sve2.h => sc.h} (99%) diff --git a/kleidicv/include/kleidicv/debug.h b/kleidicv/include/kleidicv/debug.h index 3de11595f..1c0c92cb1 100644 --- a/kleidicv/include/kleidicv/debug.h +++ b/kleidicv/include/kleidicv/debug.h @@ -9,7 +9,7 @@ #if KLEIDICV_TARGET_NEON #include "kleidicv/neon.h" #elif KLEIDICV_TARGET_SVE2 -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" #endif #include diff --git a/kleidicv/include/kleidicv/separable_filter_driver_neon.h b/kleidicv/include/kleidicv/filter_driver_neon.h similarity index 97% rename from kleidicv/include/kleidicv/separable_filter_driver_neon.h rename to kleidicv/include/kleidicv/filter_driver_neon.h index 3a684b7c8..daacde7af 100644 --- a/kleidicv/include/kleidicv/separable_filter_driver_neon.h +++ b/kleidicv/include/kleidicv/filter_driver_neon.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H +#ifndef KLEIDICV_FILTER_DRIVER_NEON_H +#define KLEIDICV_FILTER_DRIVER_NEON_H #include "kleidicv/config.h" #include "kleidicv/neon.h" @@ -137,4 +137,4 @@ class SeparableFilterDriver { } // namespace kleidicv::neon -#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H +#endif // KLEIDICV_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_sc.h b/kleidicv/include/kleidicv/filter_driver_sc.h similarity index 97% rename from kleidicv/include/kleidicv/separable_filter_driver_sc.h rename to kleidicv/include/kleidicv/filter_driver_sc.h index 34e41b643..6bbb96eee 100644 --- a/kleidicv/include/kleidicv/separable_filter_driver_sc.h +++ b/kleidicv/include/kleidicv/filter_driver_sc.h @@ -2,11 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H -#define KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#ifndef KLEIDICV_FILTER_DRIVER_SC_H +#define KLEIDICV_FILTER_DRIVER_SC_H #include "kleidicv/config.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" #include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. @@ -147,4 +147,4 @@ class SeparableFilterDriver { } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#endif // KLEIDICV_FILTER_DRIVER_SC_H diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sc.h similarity index 99% rename from kleidicv/include/kleidicv/sve2.h rename to kleidicv/include/kleidicv/sc.h index 8f656d7d7..55f45f955 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sc.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SVE2_H -#define KLEIDICV_SVE2_H +#ifndef KLEIDICV_SC_H +#define KLEIDICV_SC_H #include @@ -511,4 +511,4 @@ static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING_COMPATIBLE { } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SVE2_H +#endif // KLEIDICV_SC_H diff --git a/kleidicv/src/analysis/min_max_sc.h b/kleidicv/src/analysis/min_max_sc.h index a42ebdff8..a294cf783 100644 --- a/kleidicv/src/analysis/min_max_sc.h +++ b/kleidicv/src/analysis/min_max_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/absdiff_sme2.cpp b/kleidicv/src/arithmetics/absdiff_sme2.cpp index 9bf7c2184..00323a852 100644 --- a/kleidicv/src/arithmetics/absdiff_sme2.cpp +++ b/kleidicv/src/arithmetics/absdiff_sme2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/absdiff_sve2.cpp b/kleidicv/src/arithmetics/absdiff_sve2.cpp index 21e0eac79..6d1877d3e 100644 --- a/kleidicv/src/arithmetics/absdiff_sve2.cpp +++ b/kleidicv/src/arithmetics/absdiff_sve2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h index 3b8075b68..2dc36a106 100644 --- a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h +++ b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/add_sme2.cpp b/kleidicv/src/arithmetics/add_sme2.cpp index 7f21ffa7d..ccd1b4a6e 100644 --- a/kleidicv/src/arithmetics/add_sme2.cpp +++ b/kleidicv/src/arithmetics/add_sme2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/add_sve2.cpp b/kleidicv/src/arithmetics/add_sve2.cpp index 670237eb6..32a41dbad 100644 --- a/kleidicv/src/arithmetics/add_sve2.cpp +++ b/kleidicv/src/arithmetics/add_sve2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/compare_sc.h b/kleidicv/src/arithmetics/compare_sc.h index d18287f8f..e7d8f9d48 100644 --- a/kleidicv/src/arithmetics/compare_sc.h +++ b/kleidicv/src/arithmetics/compare_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_COMPARE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/exp_sc.h b/kleidicv/src/arithmetics/exp_sc.h index 41f9f813e..02ff145fc 100644 --- a/kleidicv/src/arithmetics/exp_sc.h +++ b/kleidicv/src/arithmetics/exp_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/arithmetics/exp_constants.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { template diff --git a/kleidicv/src/arithmetics/multiply_sve2.cpp b/kleidicv/src/arithmetics/multiply_sve2.cpp index 7edd481fb..fed1f1440 100644 --- a/kleidicv/src/arithmetics/multiply_sve2.cpp +++ b/kleidicv/src/arithmetics/multiply_sve2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/scale_sc.h b/kleidicv/src/arithmetics/scale_sc.h index 269c15b4c..c8eb64fe7 100644 --- a/kleidicv/src/arithmetics/scale_sc.h +++ b/kleidicv/src/arithmetics/scale_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SCALE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/sub_sme2.cpp b/kleidicv/src/arithmetics/sub_sme2.cpp index 1fe21043d..4f9b8f8e9 100644 --- a/kleidicv/src/arithmetics/sub_sme2.cpp +++ b/kleidicv/src/arithmetics/sub_sme2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/sub_sve2.cpp b/kleidicv/src/arithmetics/sub_sve2.cpp index 43bdb97e3..6be0690c3 100644 --- a/kleidicv/src/arithmetics/sub_sve2.cpp +++ b/kleidicv/src/arithmetics/sub_sve2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/threshold_sc.h b/kleidicv/src/arithmetics/threshold_sc.h index 2868b9b27..f0ac017f7 100644 --- a/kleidicv/src/arithmetics/threshold_sc.h +++ b/kleidicv/src/arithmetics/threshold_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_THRESHOLD_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/float_conv_sc.h b/kleidicv/src/conversions/float_conv_sc.h index d62bec2fd..c98c955e3 100644 --- a/kleidicv/src/conversions/float_conv_sc.h +++ b/kleidicv/src/conversions/float_conv_sc.h @@ -9,7 +9,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/gray_to_rgb_sc.h b/kleidicv/src/conversions/gray_to_rgb_sc.h index a0a1c7c24..ab4f49ae4 100644 --- a/kleidicv/src/conversions/gray_to_rgb_sc.h +++ b/kleidicv/src/conversions/gray_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/gray_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/rgb_to_rgb_sc.h b/kleidicv/src/conversions/rgb_to_rgb_sc.h index 8ec8cda39..f3d58ee29 100644 --- a/kleidicv/src/conversions/rgb_to_rgb_sc.h +++ b/kleidicv/src/conversions/rgb_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/rgb_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/rgb_to_yuv_sc.h b/kleidicv/src/conversions/rgb_to_yuv_sc.h index 99059a6df..2d5026792 100644 --- a/kleidicv/src/conversions/rgb_to_yuv_sc.h +++ b/kleidicv/src/conversions/rgb_to_yuv_sc.h @@ -10,7 +10,7 @@ #include "kleidicv/conversions/rgb_to_yuv.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h index 8998e54bb..bec3a2cc4 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/yuv_sp_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/yuv_to_rgb_sc.h b/kleidicv/src/conversions/yuv_to_rgb_sc.h index a2ceaf9cf..0af5d788f 100644 --- a/kleidicv/src/conversions/yuv_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/yuv_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index e9092553c..13ee1b08c 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -5,10 +5,10 @@ #include #include "kleidicv/ctypes.h" +#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" #include "kleidicv/sigma.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 9b0cdd809..f5b6b0cdb 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -7,10 +7,10 @@ #include +#include "kleidicv/filter_driver_sc.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/sc.h" #include "kleidicv/sigma.h" -#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 9faacb3ee..be40208aa 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -5,10 +5,10 @@ #include #include "kleidicv/ctypes.h" +#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index aa9fde64a..9a18321f2 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -7,9 +7,9 @@ #include +#include "kleidicv/filter_driver_sc.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 96959ebd8..cd5b7bf66 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -2,11 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index ab6a0c6e5..4b9f0518a 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -5,10 +5,10 @@ #ifndef KLEIDICV_SOBEL_SC_H #define KLEIDICV_SOBEL_SC_H +#include "kleidicv/filter_driver_sc.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/logical/bitwise_and_sc.h b/kleidicv/src/logical/bitwise_and_sc.h index 86ccf588b..3afe7b679 100644 --- a/kleidicv/src/logical/bitwise_and_sc.h +++ b/kleidicv/src/logical/bitwise_and_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_BITWISE_AND_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/morphology/morphology_sc.h b/kleidicv/src/morphology/morphology_sc.h index bce498bb2..e86af164f 100644 --- a/kleidicv/src/morphology/morphology_sc.h +++ b/kleidicv/src/morphology/morphology_sc.h @@ -10,7 +10,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" #include "kleidicv/types.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index c43b1e497..53a80b5e1 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/resize/resize_sc.h b/kleidicv/src/resize/resize_sc.h index b67d8bc66..d94e16ce1 100644 --- a/kleidicv/src/resize/resize_sc.h +++ b/kleidicv/src/resize/resize_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_RESIZE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "kleidicv/sc.h" namespace KLEIDICV_TARGET_NAMESPACE { -- GitLab From c29297e328ce9b0807ba3928cfcd488a4109bc19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Fri, 26 Jul 2024 17:10:26 +0200 Subject: [PATCH 5/6] Support arbitrary kernel sizes in Separable Filter 2D --- .../include/kleidicv/filter_driver_neon.h | 118 +++++++++++ kleidicv/include/kleidicv/workspace/border.h | 185 ++++++++++++++++++ .../include/kleidicv/workspace/separable.h | 79 +++++++- .../src/filters/separable_filter_2d_neon.cpp | 179 ++++++++++++++--- test/api/test_separable_filter_2d.cpp | 42 ++++ 5 files changed, 579 insertions(+), 24 deletions(-) diff --git a/kleidicv/include/kleidicv/filter_driver_neon.h b/kleidicv/include/kleidicv/filter_driver_neon.h index daacde7af..dcf960223 100644 --- a/kleidicv/include/kleidicv/filter_driver_neon.h +++ b/kleidicv/include/kleidicv/filter_driver_neon.h @@ -12,6 +12,13 @@ namespace kleidicv::neon { // Template for drivers of separable NxM filters. +template +class SeparableFilterDriver; + +// Template for drivers of separable NxM filters with arbitrary kernel sizes. +template +class SeparableFilterDriverArbitrary; + template class SeparableFilterDriver { public: @@ -135,6 +142,117 @@ class SeparableFilterDriver { FilterType filter_; }; // end of class SeparableFilterDriver +template +class SeparableFilterDriverArbitrary { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = typename neon::DynamicBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilterDriverArbitrary(FilterType &filter, + size_t kernel_size) + : margin(kernel_size >> 1), filter_{filter}, kernel_size_(kernel_size) {} + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(SourceVectorType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = vld1q(&src_rows.at(border_offsets.c(i))[index]); + } + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(SourceType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = src_rows.at(border_offsets.c(i))[index]; + } + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + BufferVectorType *src_a = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + BufferVectorType *src_b = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + + for (size_t i = 0; i < kernel_size_; i++) { + src_a[i] = vld1q(&src_rows.at(0, border_offsets.c(i))[index]); + } + + for (size_t i = 0; i < kernel_size_; i++) { + src_b[i] = vld1q(&src_rows.at( + 0, border_offsets.c(i))[index + BufferVecTraits::num_lanes()]); + } + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferVectorType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = vld1q(&src_rows.at(0, border_offsets.c(i))[index]); + } + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + const size_t margin; + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets &border_offsets, + size_t index) const { + BufferType *src = static_cast( + __builtin_alloca(kernel_size_ * sizeof(BufferType))); + for (size_t i = 0; i < kernel_size_; i++) { + src[i] = src_rows.at(0, border_offsets.c(i))[index]; + } + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; + const size_t kernel_size_; +}; // end of class SeparableFilterDriverArbitrary + } // namespace kleidicv::neon #endif // KLEIDICV_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/workspace/border.h b/kleidicv/include/kleidicv/workspace/border.h index 35d5f5415..65a36c9bf 100644 --- a/kleidicv/include/kleidicv/workspace/border.h +++ b/kleidicv/include/kleidicv/workspace/border.h @@ -14,6 +14,10 @@ namespace KLEIDICV_TARGET_NAMESPACE { template class FixedBorderInfo; +// Border offsets for dynamically-sized filters. +template +class DynamicBorderInfo; + template class FixedBorderInfo final { public: @@ -201,6 +205,187 @@ class FixedBorderInfo final { FixedBorderType border_type_; }; // end of class FixedBorderInfo +template +class DynamicBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + Offsets() = delete; + Offsets(const Offsets&) = delete; + Offsets& operator=(const Offsets&) = delete; + Offsets(Offsets&& other) noexcept : offsets_(other.offsets_) { + other.offsets_ = nullptr; + } + + explicit Offsets(size_t kernel_size) : offsets_(new size_t[kernel_size]) {} + ~Offsets() { delete[] offsets_; } + + size_t c(int i) const { return offsets_ ? offsets_[i] : 0; } + + private: + friend class DynamicBorderInfo; + size_t* offsets_; + }; + + DynamicBorderInfo(size_t height, FixedBorderType border_type, + size_t kernel_size) + : height_(height), + border_type_(border_type), + kernel_size_(kernel_size), + half_kernel_size_(static_cast(kernel_size >> 1)) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get_no_border(); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index); + break; + + case FixedBorderType::WRAP: + return get_border(column_index); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{kernel_size_}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + column_index = height_ - column_index - 1; + switch (border_type_) { + case FixedBorderType::REPLICATE: + return get_border(column_index); + break; + + case FixedBorderType::REFLECT: + return get_border(column_index); + break; + + case FixedBorderType::WRAP: + return get_border(column_index); + break; + + case FixedBorderType::REVERSE: + return get_border(column_index); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{kernel_size_}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index < static_cast(half_kernel_size_)) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= + (height_ - static_cast(half_kernel_size_))) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Creates the Offsets object containing offsets in the interval + // [-(KernelSize / 2), KernelSize / 2]. + inline Offsets get_no_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets{kernel_size_}; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + return offsets; + } + + // Creates the Offsets object containing offsets in various intervals + // depending on the column, border type as well the border position used. + // + // For examples, refer to the static implementation in class FixedBorderInfo. + // NOLINTBEGIN(readability-function-cognitive-complexity) + template + inline Offsets get_border(int column) const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets{kernel_size_}; + + if constexpr (!IsRight) { + for (int i = 0; i < half_kernel_size_; i++) { + if (i - half_kernel_size_ < -column) { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + offsets.offsets_[i] = -column; + } else if constexpr (BorderType == FixedBorderType::REFLECT) { + offsets.offsets_[i] = half_kernel_size_ - (column << 1) - (i + 1); + } else if constexpr (BorderType == FixedBorderType::WRAP) { + offsets.offsets_[i] = i - half_kernel_size_ + height_; + } else if constexpr (BorderType == FixedBorderType::REVERSE) { + offsets.offsets_[i] = half_kernel_size_ - (column << 1) - i; + } + continue; + } + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + } + + if constexpr (IsRight) { + for (int i = 0; i < half_kernel_size_; i++) { + offsets.offsets_[i] = i - half_kernel_size_; + } + offsets.offsets_[half_kernel_size_] = 0; + for (int i = 0; i < half_kernel_size_; i++) { + if (i >= column) { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + offsets.offsets_[half_kernel_size_ + 1 + i] = column; + } else if constexpr (BorderType == FixedBorderType::REFLECT) { + offsets.offsets_[half_kernel_size_ + 1 + i] = (column << 1) - i; + } else if constexpr (BorderType == FixedBorderType::WRAP) { + offsets.offsets_[half_kernel_size_ + 1 + i] = i - height_ + 1; + } else if constexpr (BorderType == FixedBorderType::REVERSE) { + offsets.offsets_[half_kernel_size_ + 1 + i] = + (column << 1) - (i + 1); + } + continue; + } + offsets.offsets_[half_kernel_size_ + 1 + i] = i + 1; + } + } + + return offsets; + } + // NOLINTEND(readability-function-cognitive-complexity) + + size_t height_; + FixedBorderType border_type_; + + size_t kernel_size_; + int half_kernel_size_; +}; // end of class DynamicBorderInfo + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_BORDER_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 6a501686e..719302678 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -16,6 +16,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Forward declarations. class SeparableFilterWorkspace; +class SeparableFilterWorkspaceDynamic; // Deleter for SeparableFilterWorkspace instances. class SeparableFilterWorkspaceDeleter { @@ -67,7 +68,7 @@ class SeparableFilterWorkspaceDeleter { // // Handling of borders is calculated based on offsets rather than setting up // suitably-sized buffers which could hold both borders and data. -class SeparableFilterWorkspace final { +class SeparableFilterWorkspace { public: // To avoid load/store penalties. static constexpr size_t kAlignment = 16UL; @@ -194,6 +195,7 @@ class SeparableFilterWorkspace final { } } + protected: // Offset in bytes to the buffer rows from &data_[0]. size_t buffer_rows_offset_; // Stride of the buffer rows. @@ -207,6 +209,81 @@ class SeparableFilterWorkspace final { uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); }; // end of class SeparableFilterWorkspace +class SeparableFilterWorkspaceDynamic : SeparableFilterWorkspace { + public: + // Processes rows vertically first along the full width + template + void process(Rectangle rect, + Rows src_rows, + Rows dst_rows, + size_t channels, typename FilterType::BorderType border_type, + size_t kernel_size, + FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + // Border helper which calculates border offsets. + typename FilterType::BorderInfoType vertical_border{ + rect.height(), border_type, kernel_size}; + typename FilterType::BorderInfoType horizontal_border{ + rect.width(), border_type, kernel_size}; + + // Buffer rows which hold intermediate widened data. + auto buffer_rows = Rows{reinterpret_cast( + &data_[buffer_rows_offset_]), + buffer_rows_stride_, channels}; + + // Vertical processing loop. + for (size_t vertical_index = 0; vertical_index < rect.height(); + ++vertical_index) { + // Recalculate vertical border offsets. + auto offsets = vertical_border.offsets_with_border(vertical_index); + // Process in the vertical direction first. + filter.process_vertical(rect.width(), src_rows.at(vertical_index), + buffer_rows, offsets); + // Process in the horizontal direction last. + process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), + filter, horizontal_border); + } + } + + template + void process_horizontal(size_t width, + Rows buffer_rows, + Rows dst_rows, + FilterType filter, + typename FilterType::BorderInfoType horizontal_border) + KLEIDICV_STREAMING_COMPATIBLE { + // Margin associated with the filter. + size_t margin = filter.margin; + + // Process data affected by left border. + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + auto offsets = + horizontal_border.offsets_with_left_border(horizontal_index); + filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), + dst_rows.at(0, horizontal_index), + offsets); + } + + // Process data which is not affected by any borders in bulk. + { + size_t width_without_borders = width - (2 * margin); + auto offsets = horizontal_border.offsets_without_border(); + filter.process_horizontal(width_without_borders, + buffer_rows.at(0, margin), + dst_rows.at(0, margin), offsets); + } + + // Process data affected by right border. + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + size_t index = width - margin + horizontal_index; + auto offsets = horizontal_border.offsets_with_right_border(index); + filter.process_horizontal_borders(buffer_rows.at(0, index), + dst_rows.at(0, index), offsets); + } + } +}; // end of class SeparableFilterWorkspaceDynamic + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_SEPARABLE_H diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index be40208aa..9f2a66123 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -15,8 +15,11 @@ namespace kleidicv::neon { template class SeparableFilter2D; -template <> -class SeparableFilter2D { +template +class SeparableFilter2DArbitrary; + +template +class SeparableFilter2D { public: using SourceType = uint8_t; using BufferType = uint8_t; @@ -25,25 +28,27 @@ class SeparableFilter2D { explicit SeparableFilter2D(const uint8_t *kernel_x, const uint8_t *kernel_y) : kernel_x_(kernel_x), kernel_y_(kernel_y) {} - void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const { + void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { this->vector_path_with_kernel(src, dst, kernel_y_); } - void vertical_scalar_path(const SourceType src[5], BufferType *dst) const { + void vertical_scalar_path(const SourceType src[KernelSize], + BufferType *dst) const { this->scalar_path_with_kernel(src, dst, kernel_y_); } - void horizontal_vector_path(uint8x16_t src[5], DestinationType *dst) const { + void horizontal_vector_path(uint8x16_t src[KernelSize], + DestinationType *dst) const { this->vector_path_with_kernel(src, dst, kernel_x_); } - void horizontal_scalar_path(const BufferType src[5], + void horizontal_scalar_path(const BufferType src[KernelSize], DestinationType *dst) const { this->scalar_path_with_kernel(src, dst, kernel_x_); } private: - void vector_path_with_kernel(uint8x16_t src[5], uint8_t *dst, + void vector_path_with_kernel(uint8x16_t src[KernelSize], uint8_t *dst, const uint8_t *kernel) const { uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); @@ -53,7 +58,7 @@ class SeparableFilter2D { // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 1; i < 5; i++) { + for (size_t i = 1; i < KernelSize; i++) { uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); @@ -67,7 +72,7 @@ class SeparableFilter2D { vst1q_u8(&dst[0], result); } - void scalar_path_with_kernel(const uint8_t src[5], uint8_t *dst, + void scalar_path_with_kernel(const uint8_t src[KernelSize], uint8_t *dst, const uint8_t *kernel) const { uint8_t acc; // NOLINT if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { @@ -75,7 +80,7 @@ class SeparableFilter2D { return; } - for (size_t i = 1; i < 5; i++) { + for (size_t i = 1; i < KernelSize; i++) { uint8_t temp; // NOLINT if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { dst[0] = std::numeric_limits::max(); @@ -92,7 +97,143 @@ class SeparableFilter2D { const uint8_t *kernel_x_; const uint8_t *kernel_y_; -}; +}; // end of class SeparableFilter2D + +template <> +class SeparableFilter2DArbitrary { + public: + using SourceType = uint8_t; + using BufferType = uint8_t; + using DestinationType = uint8_t; + + explicit SeparableFilter2DArbitrary(const uint8_t *kernel_x, + const uint8_t *kernel_y, + const size_t kernel_size) + : kernel_x_(kernel_x), kernel_y_(kernel_y), kernel_size_(kernel_size) {} + + void vertical_vector_path(uint8x16_t src[], BufferType *dst) const { + this->vector_path_with_kernel(src, dst, kernel_y_); + } + + void vertical_scalar_path(const SourceType src[], BufferType *dst) const { + this->scalar_path_with_kernel(src, dst, kernel_y_); + } + + void horizontal_vector_path(uint8x16_t src[], DestinationType *dst) const { + this->vector_path_with_kernel(src, dst, kernel_x_); + } + + void horizontal_scalar_path(const BufferType src[], + DestinationType *dst) const { + this->scalar_path_with_kernel(src, dst, kernel_x_); + } + + private: + void vector_path_with_kernel(uint8x16_t src[], uint8_t *dst, + const uint8_t *kernel) const { + uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); + uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); + + acc_l = vmulq_n_u16(acc_l, kernel[0]); + acc_h = vmulq_n_u16(acc_h, kernel[0]); + + for (size_t i = 1; i < kernel_size_; i++) { + uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); + uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); + + acc_l = vmlaq_n_u16(acc_l, vec_l, kernel[i]); + acc_h = vmlaq_n_u16(acc_h, vec_h, kernel[i]); + } + + uint8x8_t result_l = vqmovn_u16(acc_l); + uint8x16_t result = vqmovn_high_u16(result_l, acc_h); + + vst1q_u8(&dst[0], result); + } + + void scalar_path_with_kernel(const uint8_t src[], uint8_t *dst, + const uint8_t *kernel) const { + uint8_t acc; // NOLINT + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { + dst[0] = std::numeric_limits::max(); + return; + } + + for (size_t i = 1; i < kernel_size_; i++) { + uint8_t temp; // NOLINT + if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { + dst[0] = std::numeric_limits::max(); + return; + } + if (__builtin_add_overflow(acc, temp, &acc)) { + dst[0] = std::numeric_limits::max(); + return; + } + } + + dst[0] = acc; + } + + const uint8_t *kernel_x_; + const uint8_t *kernel_y_; + const size_t kernel_size_; +}; // end of class SeparableFilter2DArbitrary + +template +static kleidicv_error_t separable_filter_2d_fixed_kernel_size( + Rectangle &rect, Rows &src_rows, + Rows &dst_rows, const ScalarType *kernel_x, + const ScalarType *kernel_y, size_t channels, FixedBorderType border_type, + SeparableFilterWorkspace *workspace) { + using SeparableFilterClass = SeparableFilter2D; + + SeparableFilterClass filterClass{kernel_x, kernel_y}; + SeparableFilterDriver filter{filterClass}; + workspace->process(rect, src_rows, dst_rows, channels, border_type, filter); + return KLEIDICV_OK; +} + +template +static kleidicv_error_t separable_filter_2d( + const ScalarType *src, size_t src_stride, ScalarType *dst, + size_t dst_stride, Rectangle &rect, size_t channels, + const ScalarType *kernel_x, const ScalarType *kernel_y, size_t kernel_size, + FixedBorderType border_type, SeparableFilterWorkspace *workspace) { + Rows src_rows{src, src_stride, channels}; + Rows dst_rows{dst, dst_stride, channels}; + + switch (kernel_size) { + case 3: + return separable_filter_2d_fixed_kernel_size<3>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 5: + return separable_filter_2d_fixed_kernel_size<5>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 7: + return separable_filter_2d_fixed_kernel_size<7>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + case 15: + return separable_filter_2d_fixed_kernel_size<15>( + rect, src_rows, dst_rows, kernel_x, kernel_y, channels, border_type, + workspace); + default: + break; + } + + using SeparableFilterClass = SeparableFilter2DArbitrary; + + SeparableFilterClass filterClass{kernel_x, kernel_y, kernel_size}; + SeparableFilterDriverArbitrary filter{filterClass, + kernel_size}; + reinterpret_cast(workspace)->process( + rect, src_rows, dst_rows, channels, border_type, kernel_size, filter); + + return KLEIDICV_OK; +} KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t separable_filter_2d_u8( @@ -104,7 +245,7 @@ kleidicv_error_t separable_filter_2d_u8( auto *workspace = reinterpret_cast(context); auto fixed_border_type = get_fixed_border_type(border_type); - if (kernel_width != 5 || kernel_height != 5) { + if (kernel_width != kernel_height) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } @@ -136,17 +277,9 @@ kleidicv_error_t separable_filter_2d_u8( return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - using SeparableFilterClass = SeparableFilter2D; - - SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilterDriver filter{filterClass}; - - Rows src_rows{src, src_stride, channels}; - Rows dst_rows{dst, dst_stride, channels}; - workspace->process(rect, src_rows, dst_rows, channels, *fixed_border_type, - filter); - - return KLEIDICV_OK; + return separable_filter_2d(src, src_stride, dst, dst_stride, rect, channels, + kernel_x, kernel_y, kernel_width, + *fixed_border_type, workspace); } } // namespace kleidicv::neon diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp index 4d440b08c..7ffbfa9df 100644 --- a/test/api/test_separable_filter_2d.cpp +++ b/test/api/test_separable_filter_2d.cpp @@ -193,6 +193,48 @@ TYPED_TEST(SeparableFilter2D, 5x5Overflow) { EXPECT_EQ_ARRAY2D(dst_expected, dst); } +TYPED_TEST(SeparableFilter2D, Arbitrary9x9) { + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 9, 9, 9, 9)); + test::Array2D src{9, 9, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 1, 2, 3, 4, 5, 6, 7, 8, 9}); + src.set(1, 0, { 2, 3, 4, 5, 6, 7, 8, 9, 1}); + src.set(2, 0, { 3, 4, 5, 6, 7, 8, 9, 1, 2}); + src.set(3, 0, { 4, 5, 6, 7, 8, 9, 1, 2, 3}); + src.set(4, 0, { 5, 6, 7, 8, 9, 1, 2, 3, 4}); + src.set(5, 0, { 6, 7, 8, 9, 1, 2, 3, 4, 5}); + src.set(6, 0, { 7, 8, 9, 1, 2, 3, 4, 5, 6}); + src.set(7, 0, { 8, 9, 1, 2, 3, 4, 5, 6, 7}); + src.set(8, 0, { 9, 1, 2, 3, 4, 5, 6, 7, 8}); + // clang-format on + + test::Array2D kernel{9, 1}; + kernel.set(0, 0, {1, 0, 1, 0, 1, 0, 1, 0, 1}); + + test::Array2D dst{9, 9, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 9, 9, 1, kernel.data(), 9, kernel.data(), 9, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + + test::Array2D dst_expected{9, 9, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 85, 91, 106, 108, 128, 130, 150, 147, 162}); + dst_expected.set(1, 0, { 91, 106, 103, 123, 116, 136, 129, 144, 132}); + dst_expected.set(2, 0, { 106, 103, 118, 111, 131, 124, 144, 132, 147}); + dst_expected.set(3, 0, { 108, 123, 111, 131, 115, 135, 119, 134, 122}); + dst_expected.set(4, 0, { 128, 116, 131, 115, 135, 119, 139, 127, 142}); + dst_expected.set(5, 0, { 130, 136, 124, 135, 119, 130, 114, 129, 117}); + dst_expected.set(6, 0, { 150, 129, 144, 119, 139, 114, 134, 122, 137}); + dst_expected.set(7, 0, { 147, 144, 132, 134, 127, 129, 122, 137, 134}); + dst_expected.set(8, 0, { 162, 132, 147, 122, 142, 117, 137, 134, 149}); + // clang-format on + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + TYPED_TEST(SeparableFilter2D, NullPointer) { using KernelTestParams = SeparableFilter2DKernelTestParams; kleidicv_filter_context_t *context = nullptr; -- GitLab From a796c71d0eeb9b2f2ee3551ec92da083ba4c2b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Mon, 29 Jul 2024 17:42:08 +0200 Subject: [PATCH 6/6] Support arbitrary types in Separable Filter 2D So far, only one new type has been added (uint16_t). --- .../kleidicv/filters/separable_filter_2d.h | 18 ++ kleidicv/include/kleidicv/kleidicv.h | 8 + kleidicv/include/kleidicv/neon.h | 24 +++ kleidicv/include/kleidicv/neon_intrinsics.h | 32 ++++ kleidicv/include/kleidicv/traits.h | 7 + .../src/filters/separable_filter_2d_api.cpp | 5 + .../src/filters/separable_filter_2d_neon.cpp | 180 +++++++++++------- kleidicv/src/filters/separable_filter_2d_sc.h | 7 + .../src/filters/separable_filter_2d_sme2.cpp | 13 ++ .../src/filters/separable_filter_2d_sve2.cpp | 11 ++ test/api/test_separable_filter_2d.cpp | 43 +++++ 11 files changed, 278 insertions(+), 70 deletions(-) diff --git a/kleidicv/include/kleidicv/filters/separable_filter_2d.h b/kleidicv/include/kleidicv/filters/separable_filter_2d.h index 514d88a5a..9e3e48209 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_2d.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_2d.h @@ -18,6 +18,12 @@ kleidicv_error_t separable_filter_2d_u8( size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t separable_filter_2d_u16( + const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); + } // namespace neon namespace sve2 { @@ -28,6 +34,12 @@ kleidicv_error_t separable_filter_2d_u8( size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t separable_filter_2d_u16( + const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); + } // namespace sve2 namespace sme2 { @@ -38,6 +50,12 @@ kleidicv_error_t separable_filter_2d_u8( size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +kleidicv_error_t separable_filter_2d_u16( + const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); + } // namespace sme2 } // namespace kleidicv diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 54d8cc4bc..889536a9a 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1274,6 +1274,14 @@ KLEIDICV_API_DECLARATION(kleidicv_separable_filter_2d_u8, const uint8_t *src, const uint8_t *kernel_y, size_t kernel_height, kleidicv_border_type_t border_type, kleidicv_filter_context_t *context); +/// @copydoc kleidicv_separable_filter_2d_u8 +KLEIDICV_API_DECLARATION(kleidicv_separable_filter_2d_u16, const uint16_t *src, + size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, + const uint16_t *kernel_x, size_t kernel_width, + const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context); /// Applies Gaussian blur to the source image using the specified parameters. /// In-place filtering is not supported. diff --git a/kleidicv/include/kleidicv/neon.h b/kleidicv/include/kleidicv/neon.h index c5a59dc1a..0cda19cd4 100644 --- a/kleidicv/include/kleidicv/neon.h +++ b/kleidicv/include/kleidicv/neon.h @@ -50,6 +50,30 @@ class double_element_width { using type = uint64x2_t; }; +template <> +class element_half_width { + public: + using type = uint8x8_t; +}; + +template <> +class element_half_width { + public: + using type = uint16x4_t; +}; + +template <> +class element_half_width { + public: + using type = uint32x2_t; +}; + +template <> +class element_half_width { + public: + using type = uint64x1_t; +}; + // Primary template to describe logically grouped peroperties of vectors. template class VectorTypes; diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h index 0add00459..8c69cd681 100644 --- a/kleidicv/include/kleidicv/neon_intrinsics.h +++ b/kleidicv/include/kleidicv/neon_intrinsics.h @@ -251,6 +251,27 @@ static inline int32x4_t vmull_high(int16x8_t lhs, int16x8_t rhs) { return vmu static inline uint32x4_t vmull_high(uint16x8_t lhs, uint16x8_t rhs) { return vmull_high_u16(lhs, rhs); } static inline int64x2_t vmull_high(int32x4_t lhs, int32x4_t rhs) { return vmull_high_s32(lhs, rhs); } +// ----------------------------------------------------------------------------- +// vmulq_n* +// ----------------------------------------------------------------------------- + +static inline int16x8_t vmulq_n(int16x8_t lhs, int16_t rhs) { return vmulq_n_s16(lhs, rhs); } +static inline uint16x8_t vmulq_n(uint16x8_t lhs, uint16_t rhs) { return vmulq_n_u16(lhs, rhs); } +static inline int32x4_t vmulq_n(int32x4_t lhs, int32_t rhs) { return vmulq_n_s32(lhs, rhs); } +static inline uint32x4_t vmulq_n(uint32x4_t lhs, uint32_t rhs) { return vmulq_n_u32(lhs, rhs); } +static inline float32x4_t vmulq_n(float32x4_t lhs, float32_t rhs) { return vmulq_n_f32(lhs, rhs); } +static inline float64x2_t vmulq_n(float64x2_t lhs, float64_t rhs) { return vmulq_n_f64(lhs, rhs); } + +// ----------------------------------------------------------------------------- +// vmlaq_n* +// ----------------------------------------------------------------------------- + +static inline int16x8_t vmlaq_n(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } +static inline uint16x8_t vmlaq_n(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } +static inline int32x4_t vmlaq_n(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } +static inline uint32x4_t vmlaq_n(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } +static inline float32x4_t vmlaq_n(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); } + // ----------------------------------------------------------------------------- // vqmovn* // ----------------------------------------------------------------------------- @@ -261,6 +282,17 @@ static inline int16x4_t vqmovn(int32x4_t src) { return vqmovn_s32(src); } static inline uint16x4_t vqmovn(uint32x4_t src) { return vqmovn_u32(src); } static inline int32x2_t vqmovn(int64x2_t src) { return vqmovn_s64(src); } +// ----------------------------------------------------------------------------- +// vqmovl* +// ----------------------------------------------------------------------------- + +static inline int16x8_t vmovl(int8x8_t src) { return vmovl_s8(src); } +static inline uint16x8_t vmovl(uint8x8_t src) { return vmovl_u8(src); } +static inline int32x4_t vmovl(int16x4_t src) { return vmovl_s16(src); } +static inline uint32x4_t vmovl(uint16x4_t src) { return vmovl_u16(src); } +static inline int64x2_t vmovl(int32x2_t src) { return vmovl_s32(src); } +static inline uint64x2_t vmovl(uint32x2_t src) { return vmovl_u32(src); } + // ----------------------------------------------------------------------------- // vqmovn_high* // ----------------------------------------------------------------------------- diff --git a/kleidicv/include/kleidicv/traits.h b/kleidicv/include/kleidicv/traits.h index 17b56cb4f..5f2af4e09 100644 --- a/kleidicv/include/kleidicv/traits.h +++ b/kleidicv/include/kleidicv/traits.h @@ -174,6 +174,13 @@ class double_element_width; template using double_element_width_t = typename double_element_width::type; +// Returns a type which has half the element size of that of type T. +template +class element_half_width; + +template +using element_half_width_t = typename element_half_width::type; + template <> class double_element_width { public: diff --git a/kleidicv/src/filters/separable_filter_2d_api.cpp b/kleidicv/src/filters/separable_filter_2d_api.cpp index 5e6a222a2..bf35798c6 100644 --- a/kleidicv/src/filters/separable_filter_2d_api.cpp +++ b/kleidicv/src/filters/separable_filter_2d_api.cpp @@ -62,3 +62,8 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_separable_filter_2d_u8, &kleidicv::neon::separable_filter_2d_u8, KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::separable_filter_2d_u8), &kleidicv::sme2::separable_filter_2d_u8); + +KLEIDICV_MULTIVERSION_C_API( + kleidicv_separable_filter_2d_u16, &kleidicv::neon::separable_filter_2d_u16, + KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::separable_filter_2d_u16), + &kleidicv::sme2::separable_filter_2d_u16); diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 9f2a66123..be62c466d 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -18,17 +18,25 @@ class SeparableFilter2D; template class SeparableFilter2DArbitrary; -template -class SeparableFilter2D { +template +class SeparableFilter2D { public: - using SourceType = uint8_t; - using BufferType = uint8_t; - using DestinationType = uint8_t; - - explicit SeparableFilter2D(const uint8_t *kernel_x, const uint8_t *kernel_y) + using SourceType = ScalarType; + using BufferType = ScalarType; + using DestinationType = ScalarType; + + using SourceVectorType = typename VecTraits::VectorType; + using DoubleBufferType = double_element_width_t; + using InnerVectorType = typename VecTraits::VectorType; + using InnerHalfVectorType = half_element_width; + using InnerVectorHalfType = element_half_width_t; + + explicit SeparableFilter2D(const ScalarType *kernel_x, + const ScalarType *kernel_y) : kernel_x_(kernel_x), kernel_y_(kernel_y) {} - void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { + void vertical_vector_path(SourceVectorType src[KernelSize], + BufferType *dst) const { this->vector_path_with_kernel(src, dst, kernel_y_); } @@ -37,7 +45,7 @@ class SeparableFilter2D { this->scalar_path_with_kernel(src, dst, kernel_y_); } - void horizontal_vector_path(uint8x16_t src[KernelSize], + void horizontal_vector_path(SourceVectorType src[KernelSize], DestinationType *dst) const { this->vector_path_with_kernel(src, dst, kernel_x_); } @@ -48,40 +56,42 @@ class SeparableFilter2D { } private: - void vector_path_with_kernel(uint8x16_t src[KernelSize], uint8_t *dst, - const uint8_t *kernel) const { - uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); - uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); + void vector_path_with_kernel(SourceVectorType src[KernelSize], + ScalarType *dst, + const ScalarType *kernel) const { + InnerVectorType acc_l = vmovl(vget_low(src[0])); + InnerVectorType acc_h = vmovl(vget_high(src[0])); - acc_l = vmulq_n_u16(acc_l, kernel[0]); - acc_h = vmulq_n_u16(acc_h, kernel[0]); + acc_l = vmulq_n(acc_l, static_cast(kernel[0])); + acc_h = vmulq_n(acc_h, static_cast(kernel[0])); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 1; i < KernelSize; i++) { - uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); - uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); + InnerVectorType vec_l = vmovl(vget_low(src[i])); + InnerVectorType vec_h = vmovl(vget_high(src[i])); - acc_l = vmlaq_n_u16(acc_l, vec_l, kernel[i]); - acc_h = vmlaq_n_u16(acc_h, vec_h, kernel[i]); + acc_l = vmlaq_n(acc_l, vec_l, static_cast(kernel[i])); + acc_h = vmlaq_n(acc_h, vec_h, static_cast(kernel[i])); } - uint8x8_t result_l = vqmovn_u16(acc_l); - uint8x16_t result = vqmovn_high_u16(result_l, acc_h); + InnerVectorHalfType result_l = vqmovn(acc_l); + SourceVectorType result = vqmovn_high(result_l, acc_h); - vst1q_u8(&dst[0], result); + vst1q(&dst[0], result); } - void scalar_path_with_kernel(const uint8_t src[KernelSize], uint8_t *dst, - const uint8_t *kernel) const { - uint8_t acc; // NOLINT + void scalar_path_with_kernel(const ScalarType src[KernelSize], + ScalarType *dst, + const ScalarType *kernel) const { + ScalarType acc; // NOLINT if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { dst[0] = std::numeric_limits::max(); return; } for (size_t i = 1; i < KernelSize; i++) { - uint8_t temp; // NOLINT + ScalarType temp; // NOLINT if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { dst[0] = std::numeric_limits::max(); return; @@ -95,23 +105,29 @@ class SeparableFilter2D { dst[0] = acc; } - const uint8_t *kernel_x_; - const uint8_t *kernel_y_; -}; // end of class SeparableFilter2D + const ScalarType *kernel_x_; + const ScalarType *kernel_y_; +}; // end of class SeparableFilter2D -template <> -class SeparableFilter2DArbitrary { +template +class SeparableFilter2DArbitrary { public: - using SourceType = uint8_t; - using BufferType = uint8_t; - using DestinationType = uint8_t; - - explicit SeparableFilter2DArbitrary(const uint8_t *kernel_x, - const uint8_t *kernel_y, + using SourceType = ScalarType; + using BufferType = ScalarType; + using DestinationType = ScalarType; + + using SourceVectorType = typename VecTraits::VectorType; + using DoubleBufferType = double_element_width_t; + using InnerVectorType = typename VecTraits::VectorType; + using InnerHalfVectorType = half_element_width; + using InnerVectorHalfType = element_half_width_t; + + explicit SeparableFilter2DArbitrary(const ScalarType *kernel_x, + const ScalarType *kernel_y, const size_t kernel_size) : kernel_x_(kernel_x), kernel_y_(kernel_y), kernel_size_(kernel_size) {} - void vertical_vector_path(uint8x16_t src[], BufferType *dst) const { + void vertical_vector_path(SourceVectorType src[], BufferType *dst) const { this->vector_path_with_kernel(src, dst, kernel_y_); } @@ -119,7 +135,8 @@ class SeparableFilter2DArbitrary { this->scalar_path_with_kernel(src, dst, kernel_y_); } - void horizontal_vector_path(uint8x16_t src[], DestinationType *dst) const { + void horizontal_vector_path(SourceVectorType src[], + DestinationType *dst) const { this->vector_path_with_kernel(src, dst, kernel_x_); } @@ -129,31 +146,31 @@ class SeparableFilter2DArbitrary { } private: - void vector_path_with_kernel(uint8x16_t src[], uint8_t *dst, - const uint8_t *kernel) const { - uint16x8_t acc_l = vmovl_u8(vget_low_u8(src[0])); - uint16x8_t acc_h = vmovl_u8(vget_high_u8(src[0])); + void vector_path_with_kernel(SourceVectorType src[], ScalarType *dst, + const ScalarType *kernel) const { + InnerVectorType acc_l = vmovl(vget_low(src[0])); + InnerVectorType acc_h = vmovl(vget_high(src[0])); - acc_l = vmulq_n_u16(acc_l, kernel[0]); - acc_h = vmulq_n_u16(acc_h, kernel[0]); + acc_l = vmulq_n(acc_l, static_cast(kernel[0])); + acc_h = vmulq_n(acc_h, static_cast(kernel[0])); for (size_t i = 1; i < kernel_size_; i++) { - uint16x8_t vec_l = vmovl_u8(vget_low_u8(src[i])); - uint16x8_t vec_h = vmovl_u8(vget_high_u8(src[i])); + InnerVectorType vec_l = vmovl(vget_low(src[i])); + InnerVectorType vec_h = vmovl(vget_high(src[i])); - acc_l = vmlaq_n_u16(acc_l, vec_l, kernel[i]); - acc_h = vmlaq_n_u16(acc_h, vec_h, kernel[i]); + acc_l = vmlaq_n(acc_l, vec_l, static_cast(kernel[i])); + acc_h = vmlaq_n(acc_h, vec_h, static_cast(kernel[i])); } - uint8x8_t result_l = vqmovn_u16(acc_l); - uint8x16_t result = vqmovn_high_u16(result_l, acc_h); + InnerVectorHalfType result_l = vqmovn(acc_l); + SourceVectorType result = vqmovn_high(result_l, acc_h); - vst1q_u8(&dst[0], result); + vst1q(&dst[0], result); } - void scalar_path_with_kernel(const uint8_t src[], uint8_t *dst, - const uint8_t *kernel) const { - uint8_t acc; // NOLINT + void scalar_path_with_kernel(const ScalarType src[], ScalarType *dst, + const ScalarType *kernel) const { + ScalarType acc; // NOLINT // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) if (__builtin_mul_overflow(src[0], kernel[0], &acc)) { dst[0] = std::numeric_limits::max(); @@ -161,7 +178,7 @@ class SeparableFilter2DArbitrary { } for (size_t i = 1; i < kernel_size_; i++) { - uint8_t temp; // NOLINT + ScalarType temp; // NOLINT if (__builtin_mul_overflow(src[i], kernel[i], &temp)) { dst[0] = std::numeric_limits::max(); return; @@ -175,10 +192,10 @@ class SeparableFilter2DArbitrary { dst[0] = acc; } - const uint8_t *kernel_x_; - const uint8_t *kernel_y_; + const ScalarType *kernel_x_; + const ScalarType *kernel_y_; const size_t kernel_size_; -}; // end of class SeparableFilter2DArbitrary +}; // end of class SeparableFilter2DArbitrary template static kleidicv_error_t separable_filter_2d_fixed_kernel_size( @@ -186,7 +203,7 @@ static kleidicv_error_t separable_filter_2d_fixed_kernel_size( Rows &dst_rows, const ScalarType *kernel_x, const ScalarType *kernel_y, size_t channels, FixedBorderType border_type, SeparableFilterWorkspace *workspace) { - using SeparableFilterClass = SeparableFilter2D; + using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; SeparableFilterDriver filter{filterClass}; @@ -195,7 +212,7 @@ static kleidicv_error_t separable_filter_2d_fixed_kernel_size( } template -static kleidicv_error_t separable_filter_2d( +static kleidicv_error_t separable_filter_2d_selector( const ScalarType *src, size_t src_stride, ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t channels, const ScalarType *kernel_x, const ScalarType *kernel_y, size_t kernel_size, @@ -235,12 +252,13 @@ static kleidicv_error_t separable_filter_2d( return KLEIDICV_OK; } -KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t separable_filter_2d_u8( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height, size_t channels, const uint8_t *kernel_x, - size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, - kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { +template +static kleidicv_error_t separable_filter_2d_entry( + const ScalarType *src, size_t src_stride, ScalarType *dst, + size_t dst_stride, size_t width, size_t height, size_t channels, + const ScalarType *kernel_x, size_t kernel_width, const ScalarType *kernel_y, + size_t kernel_height, kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { CHECK_POINTERS(context, kernel_x, kernel_y); auto *workspace = reinterpret_cast(context); auto fixed_border_type = get_fixed_border_type(border_type); @@ -277,9 +295,31 @@ kleidicv_error_t separable_filter_2d_u8( return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - return separable_filter_2d(src, src_stride, dst, dst_stride, rect, channels, - kernel_x, kernel_y, kernel_width, - *fixed_border_type, workspace); + return separable_filter_2d_selector( + src, src_stride, dst, dst_stride, rect, channels, kernel_x, kernel_y, + kernel_width, *fixed_border_type, workspace); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t separable_filter_2d_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint8_t *kernel_x, + size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { + return separable_filter_2d_entry( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, border_type, context); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t separable_filter_2d_u16( + const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { + return separable_filter_2d_entry( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, border_type, context); } } // namespace kleidicv::neon diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 9a18321f2..234d77428 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -169,6 +169,13 @@ static kleidicv_error_t separable_filter_2d_u8_sc( return KLEIDICV_OK; } +static kleidicv_error_t separable_filter_2d_u16_sc( + const uint16_t *, size_t, uint16_t *, size_t, size_t, size_t, size_t, + const uint16_t *, size_t, const uint16_t *, size_t, kleidicv_border_type_t, + kleidicv_filter_context_t *) KLEIDICV_STREAMING_COMPATIBLE { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; +} + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_SEPARABLE_FILTER_2D_SC_H diff --git a/kleidicv/src/filters/separable_filter_2d_sme2.cpp b/kleidicv/src/filters/separable_filter_2d_sme2.cpp index fc0857178..3628b9147 100644 --- a/kleidicv/src/filters/separable_filter_2d_sme2.cpp +++ b/kleidicv/src/filters/separable_filter_2d_sme2.cpp @@ -19,4 +19,17 @@ separable_filter_2d_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, kernel_width, kernel_y, kernel_height, border_type, context); } +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +separable_filter_2d_u16(const uint16_t *src, size_t src_stride, uint16_t *dst, + size_t dst_stride, size_t width, size_t height, + size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, + size_t kernel_height, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + return separable_filter_2d_u16_sc( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, border_type, context); +} + } // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/separable_filter_2d_sve2.cpp b/kleidicv/src/filters/separable_filter_2d_sve2.cpp index 0de532c1c..5eb65a670 100644 --- a/kleidicv/src/filters/separable_filter_2d_sve2.cpp +++ b/kleidicv/src/filters/separable_filter_2d_sve2.cpp @@ -18,4 +18,15 @@ kleidicv_error_t separable_filter_2d_u8( kernel_width, kernel_y, kernel_height, border_type, context); } +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t separable_filter_2d_u16( + const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, const uint16_t *kernel_x, + size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, + kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) { + return separable_filter_2d_u16_sc( + src, src_stride, dst, dst_stride, width, height, channels, kernel_x, + kernel_width, kernel_y, kernel_height, border_type, context); +} + } // namespace kleidicv::sve2 diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp index 7ffbfa9df..bbd21d6e4 100644 --- a/test/api/test_separable_filter_2d.cpp +++ b/test/api/test_separable_filter_2d.cpp @@ -12,6 +12,7 @@ #include "test_config.h" KLEIDICV_API(separable_filter_2d, kleidicv_separable_filter_2d_u8, uint8_t) +KLEIDICV_API(separable_filter_2d, kleidicv_separable_filter_2d_u16, uint16_t) // Implements KernelTestParams for SeparableFilter2D operators. template @@ -235,6 +236,48 @@ TYPED_TEST(SeparableFilter2D, Arbitrary9x9) { EXPECT_EQ_ARRAY2D(dst_expected, dst); } +TYPED_TEST(SeparableFilter2D, Arbitrary9x9_u16) { + kleidicv_filter_context_t *context = nullptr; + ASSERT_EQ(KLEIDICV_OK, + kleidicv_filter_context_create(&context, 1, 9, 9, 9, 9)); + test::Array2D src{9, 9, test::Options::vector_length()}; + // clang-format off + src.set(0, 0, { 1, 2, 3, 4, 5, 6, 7, 8, 9}); + src.set(1, 0, { 2, 3, 4, 5, 6, 7, 8, 9, 1}); + src.set(2, 0, { 3, 4, 5, 6, 7, 8, 9, 1, 2}); + src.set(3, 0, { 4, 5, 6, 7, 8, 9, 1, 2, 3}); + src.set(4, 0, { 5, 6, 7, 8, 9, 1, 2, 3, 4}); + src.set(5, 0, { 6, 7, 8, 9, 1, 2, 3, 4, 5}); + src.set(6, 0, { 7, 8, 9, 1, 2, 3, 4, 5, 6}); + src.set(7, 0, { 8, 9, 1, 2, 3, 4, 5, 6, 7}); + src.set(8, 0, { 9, 1, 2, 3, 4, 5, 6, 7, 8}); + // clang-format on + + test::Array2D kernel{9, 1}; + kernel.set(0, 0, {5, 0, 5, 0, 5, 0, 5, 0, 5}); + + test::Array2D dst{9, 9, test::Options::vector_length()}; + EXPECT_EQ(KLEIDICV_OK, separable_filter_2d()( + src.data(), src.stride(), dst.data(), dst.stride(), + 9, 9, 1, kernel.data(), 9, kernel.data(), 9, + KLEIDICV_BORDER_TYPE_REPLICATE, context)); + EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + + test::Array2D dst_expected{9, 9, test::Options::vector_length()}; + // clang-format off + dst_expected.set(0, 0, { 2125, 2275, 2650, 2700, 3200, 3250, 3750, 3675, 4050}); + dst_expected.set(1, 0, { 2275, 2650, 2575, 3075, 2900, 3400, 3225, 3600, 3300}); + dst_expected.set(2, 0, { 2650, 2575, 2950, 2775, 3275, 3100, 3600, 3300, 3675}); + dst_expected.set(3, 0, { 2700, 3075, 2775, 3275, 2875, 3375, 2975, 3350, 3050}); + dst_expected.set(4, 0, { 3200, 2900, 3275, 2875, 3375, 2975, 3475, 3175, 3550}); + dst_expected.set(5, 0, { 3250, 3400, 3100, 3375, 2975, 3250, 2850, 3225, 2925}); + dst_expected.set(6, 0, { 3750, 3225, 3600, 2975, 3475, 2850, 3350, 3050, 3425}); + dst_expected.set(7, 0, { 3675, 3600, 3300, 3350, 3175, 3225, 3050, 3425, 3350}); + dst_expected.set(8, 0, { 4050, 3300, 3675, 3050, 3550, 2925, 3425, 3350, 3725}); + // clang-format on + EXPECT_EQ_ARRAY2D(dst_expected, dst); +} + TYPED_TEST(SeparableFilter2D, NullPointer) { using KernelTestParams = SeparableFilter2DKernelTestParams; kleidicv_filter_context_t *context = nullptr; -- GitLab