From cd2d8735dad70973cd282bacf12fec6e3ba5f66b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 7 Aug 2024 18:08:49 +0200 Subject: [PATCH 1/3] Revert "Refactor filenames" This reverts commit 8fd6ce58b92572a1dfd39d8d2131ece6e68d72db. --- kleidicv/include/kleidicv/debug.h | 2 +- ...ilter_driver_neon.h => separable_filter_driver_neon.h} | 6 +++--- .../{filter_driver_sc.h => separable_filter_driver_sc.h} | 8 ++++---- kleidicv/include/kleidicv/{sc.h => sve2.h} | 6 +++--- kleidicv/src/analysis/min_max_sc.h | 2 +- kleidicv/src/arithmetics/absdiff_sme2.cpp | 2 +- kleidicv/src/arithmetics/absdiff_sve2.cpp | 2 +- kleidicv/src/arithmetics/add_abs_with_threshold_sc.h | 2 +- kleidicv/src/arithmetics/add_sme2.cpp | 2 +- kleidicv/src/arithmetics/add_sve2.cpp | 2 +- kleidicv/src/arithmetics/compare_sc.h | 2 +- kleidicv/src/arithmetics/exp_sc.h | 2 +- kleidicv/src/arithmetics/multiply_sve2.cpp | 2 +- kleidicv/src/arithmetics/scale_sc.h | 2 +- kleidicv/src/arithmetics/sub_sme2.cpp | 2 +- kleidicv/src/arithmetics/sub_sve2.cpp | 2 +- kleidicv/src/arithmetics/threshold_sc.h | 2 +- kleidicv/src/conversions/float_conv_sc.h | 2 +- kleidicv/src/conversions/gray_to_rgb_sc.h | 2 +- kleidicv/src/conversions/rgb_to_rgb_sc.h | 2 +- kleidicv/src/conversions/rgb_to_yuv_sc.h | 2 +- kleidicv/src/conversions/yuv_sp_to_rgb_sc.h | 2 +- kleidicv/src/conversions/yuv_to_rgb_sc.h | 2 +- kleidicv/src/filters/gaussian_blur_neon.cpp | 2 +- kleidicv/src/filters/gaussian_blur_sc.h | 4 ++-- kleidicv/src/filters/separable_filter_2d_neon.cpp | 2 +- kleidicv/src/filters/separable_filter_2d_sc.h | 4 ++-- kleidicv/src/filters/sobel_neon.cpp | 2 +- kleidicv/src/filters/sobel_sc.h | 4 ++-- kleidicv/src/logical/bitwise_and_sc.h | 2 +- kleidicv/src/morphology/morphology_sc.h | 2 +- kleidicv/src/resize/resize_linear_sc.h | 2 +- kleidicv/src/resize/resize_sc.h | 2 +- 33 files changed, 43 insertions(+), 43 deletions(-) rename kleidicv/include/kleidicv/{filter_driver_neon.h => separable_filter_driver_neon.h} (97%) rename kleidicv/include/kleidicv/{filter_driver_sc.h => separable_filter_driver_sc.h} (97%) rename kleidicv/include/kleidicv/{sc.h => sve2.h} (99%) diff --git a/kleidicv/include/kleidicv/debug.h b/kleidicv/include/kleidicv/debug.h index 1c0c92cb1..3de11595f 100644 --- a/kleidicv/include/kleidicv/debug.h +++ b/kleidicv/include/kleidicv/debug.h @@ -9,7 +9,7 @@ #if KLEIDICV_TARGET_NEON #include "kleidicv/neon.h" #elif KLEIDICV_TARGET_SVE2 -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" #endif #include diff --git a/kleidicv/include/kleidicv/filter_driver_neon.h b/kleidicv/include/kleidicv/separable_filter_driver_neon.h similarity index 97% rename from kleidicv/include/kleidicv/filter_driver_neon.h rename to kleidicv/include/kleidicv/separable_filter_driver_neon.h index daacde7af..3a684b7c8 100644 --- a/kleidicv/include/kleidicv/filter_driver_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_driver_neon.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_FILTER_DRIVER_NEON_H -#define KLEIDICV_FILTER_DRIVER_NEON_H +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H #include "kleidicv/config.h" #include "kleidicv/neon.h" @@ -137,4 +137,4 @@ class SeparableFilterDriver { } // namespace kleidicv::neon -#endif // KLEIDICV_FILTER_DRIVER_NEON_H +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/filter_driver_sc.h b/kleidicv/include/kleidicv/separable_filter_driver_sc.h similarity index 97% rename from kleidicv/include/kleidicv/filter_driver_sc.h rename to kleidicv/include/kleidicv/separable_filter_driver_sc.h index 6bbb96eee..34e41b643 100644 --- a/kleidicv/include/kleidicv/filter_driver_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_driver_sc.h @@ -2,11 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_FILTER_DRIVER_SC_H -#define KLEIDICV_FILTER_DRIVER_SC_H +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H #include "kleidicv/config.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" #include "kleidicv/workspace/border.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. @@ -147,4 +147,4 @@ class SeparableFilterDriver { } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_FILTER_DRIVER_SC_H +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H diff --git a/kleidicv/include/kleidicv/sc.h b/kleidicv/include/kleidicv/sve2.h similarity index 99% rename from kleidicv/include/kleidicv/sc.h rename to kleidicv/include/kleidicv/sve2.h index c798d692c..ade877891 100644 --- a/kleidicv/include/kleidicv/sc.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SC_H -#define KLEIDICV_SC_H +#ifndef KLEIDICV_SVE2_H +#define KLEIDICV_SVE2_H #include @@ -527,4 +527,4 @@ static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING_COMPATIBLE { } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SC_H +#endif // KLEIDICV_SVE2_H diff --git a/kleidicv/src/analysis/min_max_sc.h b/kleidicv/src/analysis/min_max_sc.h index a294cf783..a42ebdff8 100644 --- a/kleidicv/src/analysis/min_max_sc.h +++ b/kleidicv/src/analysis/min_max_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/absdiff_sme2.cpp b/kleidicv/src/arithmetics/absdiff_sme2.cpp index 00323a852..9bf7c2184 100644 --- a/kleidicv/src/arithmetics/absdiff_sme2.cpp +++ b/kleidicv/src/arithmetics/absdiff_sme2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/absdiff_sve2.cpp b/kleidicv/src/arithmetics/absdiff_sve2.cpp index 6d1877d3e..21e0eac79 100644 --- a/kleidicv/src/arithmetics/absdiff_sve2.cpp +++ b/kleidicv/src/arithmetics/absdiff_sve2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h index 2dc36a106..3b8075b68 100644 --- a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h +++ b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/add_sme2.cpp b/kleidicv/src/arithmetics/add_sme2.cpp index ccd1b4a6e..7f21ffa7d 100644 --- a/kleidicv/src/arithmetics/add_sme2.cpp +++ b/kleidicv/src/arithmetics/add_sme2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/add_sve2.cpp b/kleidicv/src/arithmetics/add_sve2.cpp index 32a41dbad..670237eb6 100644 --- a/kleidicv/src/arithmetics/add_sve2.cpp +++ b/kleidicv/src/arithmetics/add_sve2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/compare_sc.h b/kleidicv/src/arithmetics/compare_sc.h index 10491c457..cdb2a7a51 100644 --- a/kleidicv/src/arithmetics/compare_sc.h +++ b/kleidicv/src/arithmetics/compare_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_COMPARE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/exp_sc.h b/kleidicv/src/arithmetics/exp_sc.h index 02ff145fc..41f9f813e 100644 --- a/kleidicv/src/arithmetics/exp_sc.h +++ b/kleidicv/src/arithmetics/exp_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/arithmetics/exp_constants.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { template diff --git a/kleidicv/src/arithmetics/multiply_sve2.cpp b/kleidicv/src/arithmetics/multiply_sve2.cpp index fed1f1440..7edd481fb 100644 --- a/kleidicv/src/arithmetics/multiply_sve2.cpp +++ b/kleidicv/src/arithmetics/multiply_sve2.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/scale_sc.h b/kleidicv/src/arithmetics/scale_sc.h index c8eb64fe7..269c15b4c 100644 --- a/kleidicv/src/arithmetics/scale_sc.h +++ b/kleidicv/src/arithmetics/scale_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SCALE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/arithmetics/sub_sme2.cpp b/kleidicv/src/arithmetics/sub_sme2.cpp index 4f9b8f8e9..1fe21043d 100644 --- a/kleidicv/src/arithmetics/sub_sme2.cpp +++ b/kleidicv/src/arithmetics/sub_sme2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sme2 { diff --git a/kleidicv/src/arithmetics/sub_sve2.cpp b/kleidicv/src/arithmetics/sub_sve2.cpp index 6be0690c3..43bdb97e3 100644 --- a/kleidicv/src/arithmetics/sub_sve2.cpp +++ b/kleidicv/src/arithmetics/sub_sve2.cpp @@ -5,7 +5,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace kleidicv::sve2 { diff --git a/kleidicv/src/arithmetics/threshold_sc.h b/kleidicv/src/arithmetics/threshold_sc.h index f0ac017f7..2868b9b27 100644 --- a/kleidicv/src/arithmetics/threshold_sc.h +++ b/kleidicv/src/arithmetics/threshold_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_THRESHOLD_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/float_conv_sc.h b/kleidicv/src/conversions/float_conv_sc.h index c98c955e3..d62bec2fd 100644 --- a/kleidicv/src/conversions/float_conv_sc.h +++ b/kleidicv/src/conversions/float_conv_sc.h @@ -9,7 +9,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/gray_to_rgb_sc.h b/kleidicv/src/conversions/gray_to_rgb_sc.h index ab4f49ae4..a0a1c7c24 100644 --- a/kleidicv/src/conversions/gray_to_rgb_sc.h +++ b/kleidicv/src/conversions/gray_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/gray_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/rgb_to_rgb_sc.h b/kleidicv/src/conversions/rgb_to_rgb_sc.h index f3d58ee29..8ec8cda39 100644 --- a/kleidicv/src/conversions/rgb_to_rgb_sc.h +++ b/kleidicv/src/conversions/rgb_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/rgb_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/rgb_to_yuv_sc.h b/kleidicv/src/conversions/rgb_to_yuv_sc.h index 2d5026792..99059a6df 100644 --- a/kleidicv/src/conversions/rgb_to_yuv_sc.h +++ b/kleidicv/src/conversions/rgb_to_yuv_sc.h @@ -10,7 +10,7 @@ #include "kleidicv/conversions/rgb_to_yuv.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h index bec3a2cc4..8998e54bb 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/yuv_sp_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/conversions/yuv_to_rgb_sc.h b/kleidicv/src/conversions/yuv_to_rgb_sc.h index 0af5d788f..a2ceaf9cf 100644 --- a/kleidicv/src/conversions/yuv_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_to_rgb_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/conversions/yuv_to_rgb.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index 13ee1b08c..e9092553c 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -5,10 +5,10 @@ #include #include "kleidicv/ctypes.h" -#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_driver_neon.h" #include "kleidicv/sigma.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index f5b6b0cdb..9b0cdd809 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -7,10 +7,10 @@ #include -#include "kleidicv/filter_driver_sc.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sigma.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index be40208aa..9faacb3ee 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -5,10 +5,10 @@ #include #include "kleidicv/ctypes.h" -#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 9a18321f2..aa9fde64a 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -7,9 +7,9 @@ #include -#include "kleidicv/filter_driver_sc.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index cd5b7bf66..96959ebd8 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -2,11 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "kleidicv/filter_driver_neon.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 4b9f0518a..ab6a0c6e5 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -5,10 +5,10 @@ #ifndef KLEIDICV_SOBEL_SC_H #define KLEIDICV_SOBEL_SC_H -#include "kleidicv/filter_driver_sc.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/logical/bitwise_and_sc.h b/kleidicv/src/logical/bitwise_and_sc.h index 3afe7b679..86ccf588b 100644 --- a/kleidicv/src/logical/bitwise_and_sc.h +++ b/kleidicv/src/logical/bitwise_and_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_BITWISE_AND_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/morphology/morphology_sc.h b/kleidicv/src/morphology/morphology_sc.h index e86af164f..bce498bb2 100644 --- a/kleidicv/src/morphology/morphology_sc.h +++ b/kleidicv/src/morphology/morphology_sc.h @@ -10,7 +10,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" #include "kleidicv/types.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index 53a80b5e1..c43b1e497 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { diff --git a/kleidicv/src/resize/resize_sc.h b/kleidicv/src/resize/resize_sc.h index d94e16ce1..b67d8bc66 100644 --- a/kleidicv/src/resize/resize_sc.h +++ b/kleidicv/src/resize/resize_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_RESIZE_SC_H #include "kleidicv/kleidicv.h" -#include "kleidicv/sc.h" +#include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { -- GitLab From c9cc59b3a4060033d8718bf3a476036336b3c07a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 7 Aug 2024 18:08:49 +0200 Subject: [PATCH 2/3] Revert "Refactor and unify separable filter drivers" This reverts commit b9d77008cc3cc81c448c1bde977c4c9035d88dd0. --- .../kleidicv/separable_filter_15x15_neon.h | 211 ++++++++++++++ .../kleidicv/separable_filter_15x15_sc.h | 264 ++++++++++++++++++ .../kleidicv/separable_filter_3x3_neon.h | 153 ++++++++++ ..._driver_sc.h => separable_filter_3x3_sc.h} | 127 +++++---- .../kleidicv/separable_filter_5x5_neon.h | 141 ++++++++++ .../kleidicv/separable_filter_5x5_sc.h | 179 ++++++++++++ .../kleidicv/separable_filter_7x7_neon.h | 155 ++++++++++ .../kleidicv/separable_filter_7x7_sc.h | 195 +++++++++++++ .../kleidicv/separable_filter_driver_neon.h | 140 ---------- kleidicv/src/filters/gaussian_blur_neon.cpp | 7 +- kleidicv/src/filters/gaussian_blur_sc.h | 7 +- .../src/filters/separable_filter_2d_neon.cpp | 4 +- kleidicv/src/filters/separable_filter_2d_sc.h | 4 +- kleidicv/src/filters/sobel_neon.cpp | 7 +- kleidicv/src/filters/sobel_sc.h | 7 +- 15 files changed, 1387 insertions(+), 214 deletions(-) create mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_sc.h create mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_neon.h rename kleidicv/include/kleidicv/{separable_filter_driver_sc.h => separable_filter_3x3_sc.h} (52%) create mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_sc.h create mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_driver_neon.h diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h new file mode 100644 index 000000000..7e6a4227f --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 15x15 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 7UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[15]; + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); + src[7] = vld1q(&src_rows.at(border_offsets.c(7))[index]); + src[8] = vld1q(&src_rows.at(border_offsets.c(8))[index]); + src[9] = vld1q(&src_rows.at(border_offsets.c(9))[index]); + src[10] = vld1q(&src_rows.at(border_offsets.c(10))[index]); + src[11] = vld1q(&src_rows.at(border_offsets.c(11))[index]); + src[12] = vld1q(&src_rows.at(border_offsets.c(12))[index]); + src[13] = vld1q(&src_rows.at(border_offsets.c(13))[index]); + src[14] = vld1q(&src_rows.at(border_offsets.c(14))[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[15]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; + src[5] = src_rows.at(border_offsets.c(5))[index]; + src[6] = src_rows.at(border_offsets.c(6))[index]; + src[7] = src_rows.at(border_offsets.c(7))[index]; + src[8] = src_rows.at(border_offsets.c(8))[index]; + src[9] = src_rows.at(border_offsets.c(9))[index]; + src[10] = src_rows.at(border_offsets.c(10))[index]; + src[11] = src_rows.at(border_offsets.c(11))[index]; + src[12] = src_rows.at(border_offsets.c(12))[index]; + src[13] = src_rows.at(border_offsets.c(13))[index]; + src[14] = src_rows.at(border_offsets.c(14))[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; + auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; + auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; + auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; + auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; + auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; + auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; + auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; + + BufferVectorType src_a[15], src_b[15]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + src_a[7] = vld1q(&src_7[0]); + src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]); + src_a[8] = vld1q(&src_8[0]); + src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]); + src_a[9] = vld1q(&src_9[0]); + src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]); + src_a[10] = vld1q(&src_10[0]); + src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]); + src_a[11] = vld1q(&src_11[0]); + src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]); + src_a[12] = vld1q(&src_12[0]); + src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]); + src_a[13] = vld1q(&src_13[0]); + src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]); + src_a[14] = vld1q(&src_14[0]); + src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[15]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + src[7] = vld1q(&src_rows.at(0, border_offsets.c(7))[index]); + src[8] = vld1q(&src_rows.at(0, border_offsets.c(8))[index]); + src[9] = vld1q(&src_rows.at(0, border_offsets.c(9))[index]); + src[10] = vld1q(&src_rows.at(0, border_offsets.c(10))[index]); + src[11] = vld1q(&src_rows.at(0, border_offsets.c(11))[index]); + src[12] = vld1q(&src_rows.at(0, border_offsets.c(12))[index]); + src[13] = vld1q(&src_rows.at(0, border_offsets.c(13))[index]); + src[14] = vld1q(&src_rows.at(0, border_offsets.c(14))[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[15]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[7] = src_rows.at(0, border_offsets.c(7))[index]; + src[8] = src_rows.at(0, border_offsets.c(8))[index]; + src[9] = src_rows.at(0, border_offsets.c(9))[index]; + src[10] = src_rows.at(0, border_offsets.c(10))[index]; + src[11] = src_rows.at(0, border_offsets.c(11))[index]; + src[12] = src_rows.at(0, border_offsets.c(12))[index]; + src[13] = src_rows.at(0, border_offsets.c(13))[index]; + src[14] = src_rows.at(0, border_offsets.c(14))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 15x15 separable filters driver type. +template +using SeparableFilter15x15 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h new file mode 100644 index 000000000..b38193b51 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h @@ -0,0 +1,264 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H +#define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 15x15 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr size_t margin = 7UL; + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c(5))[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c(6))[index]); + SourceVectorType src_7 = + svld1(pg, &src_rows.at(border_offsets.c(7))[index]); + SourceVectorType src_8 = + svld1(pg, &src_rows.at(border_offsets.c(8))[index]); + SourceVectorType src_9 = + svld1(pg, &src_rows.at(border_offsets.c(9))[index]); + SourceVectorType src_10 = + svld1(pg, &src_rows.at(border_offsets.c(10))[index]); + SourceVectorType src_11 = + svld1(pg, &src_rows.at(border_offsets.c(11))[index]); + SourceVectorType src_12 = + svld1(pg, &src_rows.at(border_offsets.c(12))[index]); + SourceVectorType src_13 = + svld1(pg, &src_rows.at(border_offsets.c(13))[index]); + SourceVectorType src_14 = + svld1(pg, &src_rows.at(border_offsets.c(14))[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, src_7, src_8, src_9, src_10, src_11, + src_12, src_13, src_14, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; + auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; + auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; + auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; + auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; + auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; + auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; + auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + BufferVectorType src_0_7 = svld1(pg, &src_7[0]); + BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); + BufferVectorType src_0_8 = svld1(pg, &src_8[0]); + BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); + BufferVectorType src_0_9 = svld1(pg, &src_9[0]); + BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); + BufferVectorType src_0_10 = svld1(pg, &src_10[0]); + BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); + BufferVectorType src_0_11 = svld1(pg, &src_11[0]); + BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); + BufferVectorType src_0_12 = svld1(pg, &src_12[0]); + BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); + BufferVectorType src_0_13 = svld1(pg, &src_13[0]); + BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); + BufferVectorType src_0_14 = svld1(pg, &src_14[0]); + BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, src_0_7, src_0_8, + src_0_9, src_0_10, src_0_11, src_0_12, + src_0_13, src_0_14, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, + src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); + BufferVectorType src_7 = + svld1(pg, &src_rows.at(0, border_offsets.c(7))[index]); + BufferVectorType src_8 = + svld1(pg, &src_rows.at(0, border_offsets.c(8))[index]); + BufferVectorType src_9 = + svld1(pg, &src_rows.at(0, border_offsets.c(9))[index]); + BufferVectorType src_10 = + svld1(pg, &src_rows.at(0, border_offsets.c(10))[index]); + BufferVectorType src_11 = + svld1(pg, &src_rows.at(0, border_offsets.c(11))[index]); + BufferVectorType src_12 = + svld1(pg, &src_rows.at(0, border_offsets.c(12))[index]); + BufferVectorType src_13 = + svld1(pg, &src_rows.at(0, border_offsets.c(13))[index]); + BufferVectorType src_14 = + svld1(pg, &src_rows.at(0, border_offsets.c(14))[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, src_7, src_8, src_9, src_10, src_11, + src_12, src_13, src_14, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[15]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[7] = src_rows.at(0, border_offsets.c(7))[index]; + src[8] = src_rows.at(0, border_offsets.c(8))[index]; + src[9] = src_rows.at(0, border_offsets.c(9))[index]; + src[10] = src_rows.at(0, border_offsets.c(10))[index]; + src[11] = src_rows.at(0, border_offsets.c(11))[index]; + src[12] = src_rows.at(0, border_offsets.c(12))[index]; + src[13] = src_rows.at(0, border_offsets.c(13))[index]; + src[14] = src_rows.at(0, border_offsets.c(14))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 15x15 separable filters driver type. +template +using SeparableFilter15x15 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_15X15_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h new file mode 100644 index 000000000..d26e54d86 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 1UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(border_offsets.c(2))[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + SourceVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.vertical_vector_path(src_a, &dst_rows[index]); + filter_.vertical_vector_path( + src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + SourceVectorType src[3]; + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[3]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + + auto src_0_x2 = vld1q_x2(&src_0[0]); + auto src_1_x2 = vld1q_x2(&src_1[0]); + auto src_2_x2 = vld1q_x2(&src_2[0]); + + BufferVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[3]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h similarity index 52% rename from kleidicv/include/kleidicv/separable_filter_driver_sc.h rename to kleidicv/include/kleidicv/separable_filter_3x3_sc.h index 34e41b643..8c7e092ab 100644 --- a/kleidicv/include/kleidicv/separable_filter_driver_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -1,11 +1,10 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H -#define KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_SC_H +#define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H -#include "kleidicv/config.h" #include "kleidicv/sve2.h" #include "kleidicv/workspace/border.h" @@ -13,8 +12,12 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Template for drivers of separable NxM filters. -template -class SeparableFilterDriver { +template +class SeparableFilter; + +// Driver for a separable 3x3 filter. +template +class SeparableFilter { public: using SourceType = typename FilterType::SourceType; using BufferType = typename FilterType::BufferType; @@ -26,33 +29,30 @@ class SeparableFilterDriver { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilterDriver(FilterType filter) - KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} - static constexpr size_t margin = KernelSize >> 1; + static constexpr size_t margin = 1UL; void process_vertical( size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - constexpr auto seq = std::make_index_sequence{}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, - seq); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining([&](size_t index, - size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index, seq); - }); + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, @@ -61,23 +61,20 @@ class SeparableFilterDriver { KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - constexpr auto seq = std::make_index_sequence{}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index, seq); + index); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, - seq); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index, - seq); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); }); } @@ -93,58 +90,72 @@ class SeparableFilterDriver { } private: - template - void vertical_vector_path( - svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const KLEIDICV_STREAMING_COMPATIBLE { - filter_.vertical_vector_path( - pg, svld1(pg, &src_rows.at(border_offsets.c(SeqNum))[index])..., - &dst_rows[index]); + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } - template - void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const - KLEIDICV_STREAMING_COMPATIBLE { - filter_.horizontal_vector_path( - pg, svld1(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index])..., - &dst_rows[index]); - + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, + &dst_rows[index]); filter_.horizontal_vector_path( - pg, - svld1_vnum(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index], 1)..., + pg, src_1_0, src_1_1, src_1_2, &dst_rows[index + BufferVecTraits::num_lanes()]); } - template void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const + BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - filter_.horizontal_vector_path( - pg, svld1(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index])..., - &dst_rows[index]); + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } void process_horizontal_border( Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[KernelSize]; - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 0; i < KernelSize; i++) { - src[i] = src_rows.at(0, border_offsets.c(i))[index]; - } + BufferType src[3]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } FilterType filter_; -}; // end of class SeparableFilterDriver +}; // end of class SeparableFilter + +// Shorthand for 3x3 separable filters driver type. +template +using SeparableFilter3x3 = SeparableFilter; } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#endif // KLEIDICV_SEPARABLE_FILTER_3X3_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h new file mode 100644 index 000000000..46746dbd8 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -0,0 +1,141 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 2UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[5]; + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[5]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + + BufferVectorType src_a[5], src_b[5]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[5]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h new file mode 100644 index 000000000..3ca4075cf --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -0,0 +1,179 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_SC_H +#define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 5x5 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr size_t margin = 2UL; + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, + &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[5]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 5x5 separable filters driver type. +template +using SeparableFilter5x5 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_5X5_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h new file mode 100644 index 000000000..2d804d933 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H + +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = 3UL; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) { + SourceVectorType src[7]; + src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); + filter_.vertical_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + SourceType src[7]; + src[0] = src_rows.at(border_offsets.c(0))[index]; + src[1] = src_rows.at(border_offsets.c(1))[index]; + src[2] = src_rows.at(border_offsets.c(2))[index]; + src[3] = src_rows.at(border_offsets.c(3))[index]; + src[4] = src_rows.at(border_offsets.c(4))[index]; + src[5] = src_rows.at(border_offsets.c(5))[index]; + src[6] = src_rows.at(border_offsets.c(6))[index]; + filter_.vertical_scalar_path(src, &dst_rows[index]); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + + BufferVectorType src_a[7], src_b[7]; + src_a[0] = vld1q(&src_0[0]); + src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_1[0]); + src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_2[0]); + src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_3[0]); + src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_4[0]); + src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_5[0]); + src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_6[0]); + src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + }); + + loop.unroll_once([&](size_t index) { + BufferVectorType src[7]; + src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + filter_.horizontal_vector_path(src, &dst_rows[index]); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h new file mode 100644 index 000000000..eab3df4b1 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -0,0 +1,195 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_SC_H +#define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H + +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilter; + +// Driver for a separable 7x7 filter. +template +class SeparableFilter { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + : filter_{filter} {} + + static constexpr size_t margin = 3UL; + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = + svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + SourceVectorType src_1 = + svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + SourceVectorType src_2 = + svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + SourceVectorType src_3 = + svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + SourceVectorType src_4 = + svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + SourceVectorType src_5 = + svld1(pg, &src_rows.at(border_offsets.c(5))[index]); + SourceVectorType src_6 = + svld1(pg, &src_rows.at(border_offsets.c(6))[index]); + filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } + + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index) const + KLEIDICV_STREAMING_COMPATIBLE { + BufferVectorType src_0 = + svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + BufferVectorType src_1 = + svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + BufferVectorType src_2 = + svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + BufferVectorType src_3 = + svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + BufferVectorType src_4 = + svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + BufferVectorType src_5 = + svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); + BufferVectorType src_6 = + svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); + filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, + src_6, &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[7]; + src[0] = src_rows.at(0, border_offsets.c(0))[index]; + src[1] = src_rows.at(0, border_offsets.c(1))[index]; + src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[3] = src_rows.at(0, border_offsets.c(3))[index]; + src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[5] = src_rows.at(0, border_offsets.c(5))[index]; + src[6] = src_rows.at(0, border_offsets.c(6))[index]; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilter + +// Shorthand for 7x7 separable filters driver type. +template +using SeparableFilter7x7 = SeparableFilter; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_7X7_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_neon.h b/kleidicv/include/kleidicv/separable_filter_driver_neon.h deleted file mode 100644 index 3a684b7c8..000000000 --- a/kleidicv/include/kleidicv/separable_filter_driver_neon.h +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H - -#include "kleidicv/config.h" -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" - -namespace kleidicv::neon { - -// Template for drivers of separable NxM filters. -template -class SeparableFilterDriver { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = typename neon::FixedBorderInfo; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilterDriver(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = KernelSize >> 1; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - constexpr auto seq = std::make_index_sequence{}; - - loop.unroll_once([&](size_t index) { - vertical_vector_path(src_rows, dst_rows, border_offsets, index, seq); - }); - - loop.tail([&](size_t index) { - vertical_scalar_path(src_rows, dst_rows, border_offsets, index, seq); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - constexpr auto seq = std::make_index_sequence{}; - - loop.unroll_twice([&](size_t index) { - horizontal_vector_path_2x(src_rows, dst_rows, border_offsets, index, seq); - }); - - loop.unroll_once([&](size_t index) { - horizontal_vector_path(src_rows, dst_rows, border_offsets, index, seq); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - constexpr auto seq = std::make_index_sequence{}; - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); - } - } - - private: - template - void vertical_vector_path(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const { - SourceVectorType src[KernelSize] = { - vld1q(&src_rows.at(border_offsets.c(SeqNum))[index])...}; - filter_.vertical_vector_path(src, &dst_rows[index]); - } - - template - void vertical_scalar_path(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const { - SourceType src[KernelSize] = { - src_rows.at(border_offsets.c(SeqNum))[index]...}; - filter_.vertical_scalar_path(src, &dst_rows[index]); - } - - template - void horizontal_vector_path_2x(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const { - BufferVectorType src_a[KernelSize] = { - vld1q(&src_rows.at(0, border_offsets.c(SeqNum))[index])...}; - BufferVectorType src_b[KernelSize] = {vld1q(&src_rows.at( - 0, border_offsets.c(SeqNum))[index + BufferVecTraits::num_lanes()])...}; - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - template - void horizontal_vector_path(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const { - BufferVectorType src[KernelSize] = { - vld1q(&src_rows.at(0, border_offsets.c(SeqNum))[index])...}; - filter_.horizontal_vector_path(src, &dst_rows[index]); - } - - template - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index, - std::index_sequence) const { - BufferType src[KernelSize] = { - src_rows.at(0, border_offsets.c(SeqNum))[index]...}; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilterDriver - -} // namespace kleidicv::neon - -#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index e9092553c..bf4d82aa1 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -8,7 +8,10 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" +#include "kleidicv/separable_filter_15x15_neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" +#include "kleidicv/separable_filter_5x5_neon.h" +#include "kleidicv/separable_filter_7x7_neon.h" #include "kleidicv/sigma.h" namespace kleidicv::neon { @@ -624,7 +627,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilterDriver filter{blur}; + SeparableFilter filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 9b0cdd809..e11fb8a58 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,7 +8,10 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/separable_filter_15x15_sc.h" +#include "kleidicv/separable_filter_3x3_sc.h" +#include "kleidicv/separable_filter_5x5_sc.h" +#include "kleidicv/separable_filter_7x7_sc.h" #include "kleidicv/sigma.h" #include "kleidicv/sve2.h" @@ -816,7 +819,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilterDriver filter{blur}; + SeparableFilter filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 9faacb3ee..8d3d0d3ed 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -8,7 +8,7 @@ #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" +#include "kleidicv/separable_filter_5x5_neon.h" namespace kleidicv::neon { @@ -139,7 +139,7 @@ kleidicv_error_t separable_filter_2d_u8( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilterDriver filter{filterClass}; + SeparableFilter filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index aa9fde64a..9ba9c9fb8 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/separable_filter_5x5_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -159,7 +159,7 @@ static kleidicv_error_t separable_filter_2d_u8_sc( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilterDriver filter{filterClass}; + SeparableFilter filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 96959ebd8..09e108575 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -6,7 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_driver_neon.h" +#include "kleidicv/separable_filter_3x3_neon.h" namespace kleidicv::neon { @@ -157,8 +157,7 @@ kleidicv_error_t sobel_3x3_horizontal_s16_u8(const uint8_t *src, } HorizontalSobel3x3 horizontal_sobel; - SeparableFilterDriver, 3> filter{ - horizontal_sobel}; + SeparableFilter3x3> filter{horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -194,7 +193,7 @@ kleidicv_error_t sobel_3x3_vertical_s16_u8(const uint8_t *src, } VerticalSobel3x3 vertical_sobel; - SeparableFilterDriver, 3> filter{vertical_sobel}; + SeparableFilter3x3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index ab6a0c6e5..783cd55ad 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_driver_sc.h" +#include "kleidicv/separable_filter_3x3_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -148,8 +148,7 @@ static kleidicv_error_t sobel_3x3_horizontal_s16_u8_sc( } HorizontalSobel3x3 horizontal_sobel; - SeparableFilterDriver, 3> filter{ - horizontal_sobel}; + SeparableFilter3x3> filter{horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -185,7 +184,7 @@ static kleidicv_error_t sobel_3x3_vertical_s16_u8_sc( } VerticalSobel3x3 vertical_sobel; - SeparableFilterDriver, 3> filter{vertical_sobel}; + SeparableFilter3x3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; -- GitLab From 8d54cfac7e47f5f1635a0eb7f4856c9425c10dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 7 Aug 2024 18:08:50 +0200 Subject: [PATCH 3/3] Revert "Refactor and unify border header files" This reverts commit 54d18e79f5747982e098dea3ebb456f789572afd. --- .../kleidicv/separable_filter_15x15_neon.h | 154 +++++----- .../kleidicv/separable_filter_15x15_sc.h | 124 ++++---- .../kleidicv/separable_filter_3x3_neon.h | 40 +-- .../kleidicv/separable_filter_3x3_sc.h | 28 +- .../kleidicv/separable_filter_5x5_neon.h | 54 ++-- .../kleidicv/separable_filter_5x5_sc.h | 44 +-- .../kleidicv/separable_filter_7x7_neon.h | 74 ++--- .../kleidicv/separable_filter_7x7_sc.h | 60 ++-- kleidicv/include/kleidicv/workspace/border.h | 207 ------------- .../include/kleidicv/workspace/border_15x15.h | 276 ++++++++++++++++++ .../include/kleidicv/workspace/border_3x3.h | 116 ++++++++ .../include/kleidicv/workspace/border_5x5.h | 162 ++++++++++ .../include/kleidicv/workspace/border_7x7.h | 181 ++++++++++++ .../include/kleidicv/workspace/separable.h | 5 +- 14 files changed, 1026 insertions(+), 499 deletions(-) delete mode 100644 kleidicv/include/kleidicv/workspace/border.h create mode 100644 kleidicv/include/kleidicv/workspace/border_15x15.h create mode 100644 kleidicv/include/kleidicv/workspace/border_3x3.h create mode 100644 kleidicv/include/kleidicv/workspace/border_5x5.h create mode 100644 kleidicv/include/kleidicv/workspace/border_7x7.h diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h index 7e6a4227f..2475d1db3 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_15x15.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,41 +42,41 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[15]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); - src[7] = vld1q(&src_rows.at(border_offsets.c(7))[index]); - src[8] = vld1q(&src_rows.at(border_offsets.c(8))[index]); - src[9] = vld1q(&src_rows.at(border_offsets.c(9))[index]); - src[10] = vld1q(&src_rows.at(border_offsets.c(10))[index]); - src[11] = vld1q(&src_rows.at(border_offsets.c(11))[index]); - src[12] = vld1q(&src_rows.at(border_offsets.c(12))[index]); - src[13] = vld1q(&src_rows.at(border_offsets.c(13))[index]); - src[14] = vld1q(&src_rows.at(border_offsets.c(14))[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[15]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; - src[5] = src_rows.at(border_offsets.c(5))[index]; - src[6] = src_rows.at(border_offsets.c(6))[index]; - src[7] = src_rows.at(border_offsets.c(7))[index]; - src[8] = src_rows.at(border_offsets.c(8))[index]; - src[9] = src_rows.at(border_offsets.c(9))[index]; - src[10] = src_rows.at(border_offsets.c(10))[index]; - src[11] = src_rows.at(border_offsets.c(11))[index]; - src[12] = src_rows.at(border_offsets.c(12))[index]; - src[13] = src_rows.at(border_offsets.c(13))[index]; - src[14] = src_rows.at(border_offsets.c(14))[index]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + src[5] = src_rows.at(border_offsets.c5())[index]; + src[6] = src_rows.at(border_offsets.c6())[index]; + src[7] = src_rows.at(border_offsets.c7())[index]; + src[8] = src_rows.at(border_offsets.c8())[index]; + src[9] = src_rows.at(border_offsets.c9())[index]; + src[10] = src_rows.at(border_offsets.c10())[index]; + src[11] = src_rows.at(border_offsets.c11())[index]; + src[12] = src_rows.at(border_offsets.c12())[index]; + src[13] = src_rows.at(border_offsets.c13())[index]; + src[14] = src_rows.at(border_offsets.c14())[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -88,21 +88,21 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; - auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; - auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; - auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; - auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; - auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; - auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; - auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; BufferVectorType src_a[15], src_b[15]; src_a[0] = vld1q(&src_0[0]); @@ -143,21 +143,21 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[15]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); - src[7] = vld1q(&src_rows.at(0, border_offsets.c(7))[index]); - src[8] = vld1q(&src_rows.at(0, border_offsets.c(8))[index]); - src[9] = vld1q(&src_rows.at(0, border_offsets.c(9))[index]); - src[10] = vld1q(&src_rows.at(0, border_offsets.c(10))[index]); - src[11] = vld1q(&src_rows.at(0, border_offsets.c(11))[index]); - src[12] = vld1q(&src_rows.at(0, border_offsets.c(12))[index]); - src[13] = vld1q(&src_rows.at(0, border_offsets.c(13))[index]); - src[14] = vld1q(&src_rows.at(0, border_offsets.c(14))[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); + src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]); + src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]); + src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]); + src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]); + src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]); + src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]); + src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]); + src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -181,21 +181,21 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - src[7] = src_rows.at(0, border_offsets.c(7))[index]; - src[8] = src_rows.at(0, border_offsets.c(8))[index]; - src[9] = src_rows.at(0, border_offsets.c(9))[index]; - src[10] = src_rows.at(0, border_offsets.c(10))[index]; - src[11] = src_rows.at(0, border_offsets.c(11))[index]; - src[12] = src_rows.at(0, border_offsets.c(12))[index]; - src[13] = src_rows.at(0, border_offsets.c(13))[index]; - src[14] = src_rows.at(0, border_offsets.c(14))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h index b38193b51..f95067a09 100644 --- a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_15x15.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,35 +95,35 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(border_offsets.c1())[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(border_offsets.c2())[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(border_offsets.c3())[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(border_offsets.c4())[index]); SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c(5))[index]); + svld1(pg, &src_rows.at(border_offsets.c5())[index]); SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c(6))[index]); + svld1(pg, &src_rows.at(border_offsets.c6())[index]); SourceVectorType src_7 = - svld1(pg, &src_rows.at(border_offsets.c(7))[index]); + svld1(pg, &src_rows.at(border_offsets.c7())[index]); SourceVectorType src_8 = - svld1(pg, &src_rows.at(border_offsets.c(8))[index]); + svld1(pg, &src_rows.at(border_offsets.c8())[index]); SourceVectorType src_9 = - svld1(pg, &src_rows.at(border_offsets.c(9))[index]); + svld1(pg, &src_rows.at(border_offsets.c9())[index]); SourceVectorType src_10 = - svld1(pg, &src_rows.at(border_offsets.c(10))[index]); + svld1(pg, &src_rows.at(border_offsets.c10())[index]); SourceVectorType src_11 = - svld1(pg, &src_rows.at(border_offsets.c(11))[index]); + svld1(pg, &src_rows.at(border_offsets.c11())[index]); SourceVectorType src_12 = - svld1(pg, &src_rows.at(border_offsets.c(12))[index]); + svld1(pg, &src_rows.at(border_offsets.c12())[index]); SourceVectorType src_13 = - svld1(pg, &src_rows.at(border_offsets.c(13))[index]); + svld1(pg, &src_rows.at(border_offsets.c13())[index]); SourceVectorType src_14 = - svld1(pg, &src_rows.at(border_offsets.c(14))[index]); + svld1(pg, &src_rows.at(border_offsets.c14())[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, src_9, src_10, src_11, src_12, src_13, src_14, &dst_rows[index]); @@ -133,21 +133,21 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; - auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; - auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; - auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; - auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; - auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; - auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; - auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; - auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; + auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; + auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; + auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; + auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; + auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; + auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; + auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; + auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -195,35 +195,35 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); BufferVectorType src_7 = - svld1(pg, &src_rows.at(0, border_offsets.c(7))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c7())[index]); BufferVectorType src_8 = - svld1(pg, &src_rows.at(0, border_offsets.c(8))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c8())[index]); BufferVectorType src_9 = - svld1(pg, &src_rows.at(0, border_offsets.c(9))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c9())[index]); BufferVectorType src_10 = - svld1(pg, &src_rows.at(0, border_offsets.c(10))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c10())[index]); BufferVectorType src_11 = - svld1(pg, &src_rows.at(0, border_offsets.c(11))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c11())[index]); BufferVectorType src_12 = - svld1(pg, &src_rows.at(0, border_offsets.c(12))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c12())[index]); BufferVectorType src_13 = - svld1(pg, &src_rows.at(0, border_offsets.c(13))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); BufferVectorType src_14 = - svld1(pg, &src_rows.at(0, border_offsets.c(14))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, src_9, src_10, src_11, src_12, src_13, src_14, &dst_rows[index]); @@ -234,21 +234,21 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; - src[7] = src_rows.at(0, border_offsets.c(7))[index]; - src[8] = src_rows.at(0, border_offsets.c(8))[index]; - src[9] = src_rows.at(0, border_offsets.c(9))[index]; - src[10] = src_rows.at(0, border_offsets.c(10))[index]; - src[11] = src_rows.at(0, border_offsets.c(11))[index]; - src[12] = src_rows.at(0, border_offsets.c(12))[index]; - src[13] = src_rows.at(0, border_offsets.c(13))[index]; - src[14] = src_rows.at(0, border_offsets.c(14))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; + src[7] = src_rows.at(0, border_offsets.c7())[index]; + src[8] = src_rows.at(0, border_offsets.c8())[index]; + src[9] = src_rows.at(0, border_offsets.c9())[index]; + src[10] = src_rows.at(0, border_offsets.c10())[index]; + src[11] = src_rows.at(0, border_offsets.c11())[index]; + src[12] = src_rows.at(0, border_offsets.c12())[index]; + src[13] = src_rows.at(0, border_offsets.c13())[index]; + src[14] = src_rows.at(0, border_offsets.c14())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h index d26e54d86..3fecea047 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_3x3.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -41,9 +41,9 @@ class SeparableFilter { SourceVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(border_offsets.c(2))[index]; + auto src_0 = &src_rows.at(border_offsets.c0())[index]; + auto src_1 = &src_rows.at(border_offsets.c1())[index]; + auto src_2 = &src_rows.at(border_offsets.c2())[index]; auto src_0_x2 = vld1q_x2(&src_0[0]); auto src_1_x2 = vld1q_x2(&src_1[0]); @@ -64,17 +64,17 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[3]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -86,9 +86,9 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; auto src_0_x2 = vld1q_x2(&src_0[0]); auto src_1_x2 = vld1q_x2(&src_1[0]); @@ -109,9 +109,9 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -135,9 +135,9 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h index 8c7e092ab..6f624ae1c 100644 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_3x3.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,11 +95,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(border_offsets.c1())[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(border_offsets.c2())[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } @@ -107,9 +107,9 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -130,11 +130,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); } @@ -143,9 +143,9 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h index 46746dbd8..34f4290d7 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_5x5.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,21 +42,21 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[5]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -68,11 +68,11 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; BufferVectorType src_a[5], src_b[5]; src_a[0] = vld1q(&src_0[0]); @@ -93,11 +93,11 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -121,11 +121,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h index 3ca4075cf..909e8ce18 100644 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_5x5.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,15 +95,15 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(border_offsets.c1())[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(border_offsets.c2())[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(border_offsets.c3())[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(border_offsets.c4())[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, &dst_rows[index]); } @@ -112,11 +112,11 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -141,15 +141,15 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, &dst_rows[index]); } @@ -159,11 +159,11 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h index 2d804d933..4305d9d06 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H #include "kleidicv/neon.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_7x7.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -26,7 +26,7 @@ class SeparableFilter { using BufferVecTraits = typename neon::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -42,25 +42,25 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { SourceVectorType src[7]; - src[0] = vld1q(&src_rows.at(border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c(6))[index]); + src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); filter_.vertical_vector_path(src, &dst_rows[index]); }); loop.tail([&](size_t index) { SourceType src[7]; - src[0] = src_rows.at(border_offsets.c(0))[index]; - src[1] = src_rows.at(border_offsets.c(1))[index]; - src[2] = src_rows.at(border_offsets.c(2))[index]; - src[3] = src_rows.at(border_offsets.c(3))[index]; - src[4] = src_rows.at(border_offsets.c(4))[index]; - src[5] = src_rows.at(border_offsets.c(5))[index]; - src[6] = src_rows.at(border_offsets.c(6))[index]; + src[0] = src_rows.at(border_offsets.c0())[index]; + src[1] = src_rows.at(border_offsets.c1())[index]; + src[2] = src_rows.at(border_offsets.c2())[index]; + src[3] = src_rows.at(border_offsets.c3())[index]; + src[4] = src_rows.at(border_offsets.c4())[index]; + src[5] = src_rows.at(border_offsets.c5())[index]; + src[6] = src_rows.at(border_offsets.c6())[index]; filter_.vertical_scalar_path(src, &dst_rows[index]); }); } @@ -72,13 +72,13 @@ class SeparableFilter { BufferVecTraits::num_lanes()}; loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; BufferVectorType src_a[7], src_b[7]; src_a[0] = vld1q(&src_0[0]); @@ -103,13 +103,13 @@ class SeparableFilter { loop.unroll_once([&](size_t index) { BufferVectorType src[7]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); + src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); + src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); + src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); + src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); + src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); + src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); filter_.horizontal_vector_path(src, &dst_rows[index]); }); @@ -133,13 +133,13 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const { BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h index eab3df4b1..33f204a10 100644 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h @@ -6,7 +6,7 @@ #define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H #include "kleidicv/sve2.h" -#include "kleidicv/workspace/border.h" +#include "kleidicv/workspace/border_7x7.h" // It is used by SVE2 and SME2, the actual namespace will reflect it. namespace KLEIDICV_TARGET_NAMESPACE { @@ -29,7 +29,7 @@ class SeparableFilter { typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; using BufferVectorType = typename BufferVecTraits::VectorType; using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; @@ -95,19 +95,19 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(border_offsets.c1())[index]); SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(border_offsets.c2())[index]); SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(border_offsets.c3())[index]); SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(border_offsets.c4())[index]); SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c(5))[index]); + svld1(pg, &src_rows.at(border_offsets.c5())[index]); SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c(6))[index]); + svld1(pg, &src_rows.at(border_offsets.c6())[index]); filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, &dst_rows[index]); } @@ -116,13 +116,13 @@ class SeparableFilter { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; - auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; - auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; - auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; - auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; - auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; - auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; + auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; + auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; + auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; + auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; + auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; + auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; BufferVectorType src_0_0 = svld1(pg, &src_0[0]); BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); @@ -151,19 +151,19 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c(0))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c(1))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c(2))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c(3))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c(4))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c(5))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c(6))[index]); + svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, &dst_rows[index]); } @@ -173,13 +173,13 @@ class SeparableFilter { BorderOffsets border_offsets, size_t index) const KLEIDICV_STREAMING_COMPATIBLE { BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c(0))[index]; - src[1] = src_rows.at(0, border_offsets.c(1))[index]; - src[2] = src_rows.at(0, border_offsets.c(2))[index]; - src[3] = src_rows.at(0, border_offsets.c(3))[index]; - src[4] = src_rows.at(0, border_offsets.c(4))[index]; - src[5] = src_rows.at(0, border_offsets.c(5))[index]; - src[6] = src_rows.at(0, border_offsets.c(6))[index]; + src[0] = src_rows.at(0, border_offsets.c0())[index]; + src[1] = src_rows.at(0, border_offsets.c1())[index]; + src[2] = src_rows.at(0, border_offsets.c2())[index]; + src[3] = src_rows.at(0, border_offsets.c3())[index]; + src[4] = src_rows.at(0, border_offsets.c4())[index]; + src[5] = src_rows.at(0, border_offsets.c5())[index]; + src[6] = src_rows.at(0, border_offsets.c6())[index]; filter_.horizontal_scalar_path(src, &dst_rows[index]); } diff --git a/kleidicv/include/kleidicv/workspace/border.h b/kleidicv/include/kleidicv/workspace/border.h deleted file mode 100644 index 3e5a8d34c..000000000 --- a/kleidicv/include/kleidicv/workspace/border.h +++ /dev/null @@ -1,207 +0,0 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_WORKSPACE_BORDER_H -#define KLEIDICV_WORKSPACE_BORDER_H - -#include "border_types.h" -#include "kleidicv/kleidicv.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Border offsets for fixed-size filters. -template -class FixedBorderInfo; - -template -class FixedBorderInfo final { - public: - // Simple object holding read-only constant offsets. - // Note: We are not using the default constructor, but it is defined for the - // unreachable cases in the code below. NOLINTNEXTLINE - class Offsets final { - public: - Offsets() = default; - - template - explicit Offsets(Args... args) : offsets_{static_cast(args)...} { - static_assert(sizeof...(args) == KernelSize); - } - - size_t c(int i) const { return offsets_[i]; } - - private: - size_t offsets_[KernelSize]; - }; - - FixedBorderInfo(size_t length, FixedBorderType border_type) - : length_(length), border_type_(border_type) {} - - // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { - constexpr auto seq = std::make_integer_sequence> 1)>{}; - return get_no_border(seq); - } - - // Returns offsets for rows/columns affected by the top or the left border. - Offsets offsets_with_top_or_left_border(size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - constexpr auto seq = std::make_integer_sequence> 1)>{}; - switch (border_type_) { - case FixedBorderType::REPLICATE: - return get_border(index, seq); - break; - - case FixedBorderType::REFLECT: - return get_border(index, seq); - break; - - case FixedBorderType::WRAP: - return get_border(index, seq); - break; - - case FixedBorderType::REVERSE: - return get_border(index, seq); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows/columns affected by the bottom or the - // right border. - Offsets offsets_with_bottom_or_right_border(size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - constexpr auto seq = std::make_integer_sequence> 1)>{}; - index = length_ - index - 1; - switch (border_type_) { - case FixedBorderType::REPLICATE: - return get_border(index, seq); - break; - - case FixedBorderType::REFLECT: - return get_border(index, seq); - break; - - case FixedBorderType::WRAP: - return get_border(index, seq); - break; - - case FixedBorderType::REVERSE: - return get_border(index, seq); - break; - } - // Unreachable. Compiler should emit a warning-as-error if any cases are - // uncovered above. - return Offsets{}; // GCOVR_EXCL_LINE - } - - // Returns offsets for rows/columns affected by any border. - Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { - if (row_or_column_index < (KernelSize >> 1)) { - // Rows and columns have the same offsets. - return offsets_with_top_or_left_border(row_or_column_index); - } - if (row_or_column_index >= (length_ - (KernelSize >> 1))) { - // Rows and columns have the same offsets. - return offsets_with_bottom_or_right_border(row_or_column_index); - } - return offsets_without_border(); - } - - private: - // Creates the Offsets object containing offsets in the interval - // [-(KernelSize / 2), KernelSize / 2]. - template - inline Offsets get_no_border(std::integer_sequence) const - KLEIDICV_STREAMING_COMPATIBLE { - // Example (15x15): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, - // 7}; - return Offsets{(SeqNum - (KernelSize >> 1))..., 0, (SeqNum + 1)...}; - } - - // Creates the Offsets object containing offsets in various intervals - // depending on the row/column index, border type as well the border - // position used. NOLINTBEGIN(readability-function-cognitive-complexity) - template - inline Offsets get_border(int index, std::integer_sequence) - const KLEIDICV_STREAMING_COMPATIBLE { - if constexpr (BorderType == FixedBorderType::REPLICATE && !IsRight) { - // Example (15x15, index 4, left): Offsets{-4, -4, -4, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 5, 6, 7}; - return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -index) - ? -index - : (SeqNum - (KernelSize >> 1))..., - 0, (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::REPLICATE && IsRight) { - // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 4, 4, 4}; - return Offsets{(SeqNum - (KernelSize >> 1))..., 0, - (SeqNum >= index) ? index : (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::REFLECT && !IsRight) { - // Example (15x15, index 4, left): Offsets{-2, -3, -4, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 5, 6, 7}; - return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -index) - ? ((KernelSize >> 1) - (index << 1) - (SeqNum + 1)) - : (SeqNum - (KernelSize >> 1))..., - 0, (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::REFLECT && IsRight) { - // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 4, 3, 2}; - return Offsets{ - (SeqNum - (KernelSize >> 1))..., 0, - (SeqNum >= index) ? ((index << 1) - SeqNum) : (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::WRAP && !IsRight) { - // Example (15x15, index 4, left): Offsets{length_ - 7, length_ - 6, - // length_ - 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}; - return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -index) - ? (SeqNum - (KernelSize >> 1) + length_) - : (SeqNum - (KernelSize >> 1))..., - 0, (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::WRAP && IsRight) { - // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 5 - length_, 6 - length_, 7 - length_}; - return Offsets{ - (SeqNum - (KernelSize >> 1))..., 0, - (SeqNum >= index) ? (SeqNum - length_ + 1) : (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::REVERSE && !IsRight) { - // Example (15x15, index 4, left): Offsets{-1, -2, -3, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 5, 6, 7}; - return Offsets{(SeqNum - static_cast(KernelSize >> 1) < -index) - ? ((KernelSize >> 1) - (index << 1) - SeqNum) - : (SeqNum - (KernelSize >> 1))..., - 0, (SeqNum + 1)...}; - } - - if constexpr (BorderType == FixedBorderType::REVERSE && IsRight) { - // Example (15x15, index 4, right): Offsets{-7, -6, -5, -4, -3, -2, -1, 0, - // 1, 2, 3, 4, 3, 2, 1}; - return Offsets{ - (SeqNum - (KernelSize >> 1))..., 0, - (SeqNum >= index) ? ((index << 1) - (SeqNum + 1)) : (SeqNum + 1)...}; - } - } - // NOLINTEND(readability-function-cognitive-complexity) - - size_t length_; - FixedBorderType border_type_; -}; // end of class FixedBorderInfo - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_WORKSPACE_BORDER_H diff --git a/kleidicv/include/kleidicv/workspace/border_15x15.h b/kleidicv/include/kleidicv/workspace/border_15x15.h new file mode 100644 index 000000000..eb3ae12ad --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_15x15.h @@ -0,0 +1,276 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_15X15_H +#define KLEIDICV_WORKSPACE_BORDER_15X15_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 15x15 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, size_t o11, + size_t o12, size_t o13, size_t o14) + : offsets_{o0, o1, o2, o3, o4, o5, o6, o7, + o8, o9, o10, o11, o12, o13, o14} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + size_t c7() const { return offsets_[7]; } + size_t c8() const { return offsets_[8]; } + size_t c9() const { return offsets_[9]; } + size_t c10() const { return offsets_[10]; } + size_t c11() const { return offsets_[11]; } + size_t c12() const { return offsets_[12]; } + size_t c13() const { return offsets_[13]; } + size_t c14() const { return offsets_[14]; } + + private: + size_t offsets_[15]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + + // NOLINTBEGIN(readability-function-cognitive-complexity) + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(-1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(-2, -2, -2, -2, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(-3, -3, -3, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-4, -4, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-5, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(6, 5, 4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(4, 3, 2, 1, 0, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(2, 1, 0, -1, -2, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(0, -1, -2, -3, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-2, -3, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-4, -5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-6, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3, 4, 5, 6, + 7); + } else if (column_index == 1) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, height_ - 2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, + height_ - 3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(height_ - 7, height_ - 6, height_ - 5, height_ - 4, -3, -2, + -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(height_ - 7, height_ - 6, height_ - 5, -4, -3, -2, -1, 0, + 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(height_ - 7, height_ - 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, + 4, 5, 6, 7); + } else { + return get(height_ - 7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(7, 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 1) { + return get(5, 4, 3, 2, 1, 0, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 2) { + return get(3, 2, 1, 0, -1, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 3) { + return get(1, 0, -1, -2, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 4) { + return get(-1, -2, -3, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else if (column_index == 5) { + return get(-3, -4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return get(-5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 5); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 4, 4); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, 3, 3); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 2, 2, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 1, 1, 1, 1, 1); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 6); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 5, 4); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 3, 2); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, 2, 1, 0); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 1, 0, -1, -2); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 1, 0, -1, -2, -3, -4); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 0, -1, -2, -3, -4, -5, -6); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, + 7 - height_); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6 - height_, + 7 - height_); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5 - height_, + 6 - height_, 7 - height_); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4 - height_, + 5 - height_, 6 - height_, 7 - height_); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3 - height_, + 4 - height_, 5 - height_, 6 - height_, 7 - height_); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2 - height_, 3 - height_, + 4 - height_, 5 - height_, 6 - height_, 7 - height_); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1 - height_, 2 - height_, + 3 - height_, 4 - height_, 5 - height_, 6 - height_, + 7 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 7)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 5); + } else if (column_index == (height_ - 6)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 4, 3); + } else if (column_index == (height_ - 5)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 3, 2, 1); + } else if (column_index == (height_ - 4)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 2, 1, 0, -1); + } else if (column_index == (height_ - 3)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 1, 0, -1, -2, -3); + } else if (column_index == (height_ - 2)) { + return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 0, -1, -2, -3, -4, -5); + } else { + return get(-7, -6, -5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5, -6, -7); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + // NOLINTEND(readability-function-cognitive-complexity) + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 6U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 7U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6, size_t o7, size_t o8, size_t o9, size_t o10, + size_t o11, size_t o12, size_t o13, + size_t o14) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, + o8, o9, o10, o11, o12, o13, o14}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 15x15 filter border type. +template +using FixedBorderInfo15x15 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_15X15_H diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h new file mode 100644 index 000000000..ecd5627d3 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_3x3.h @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_3X3_H +#define KLEIDICV_WORKSPACE_BORDER_3X3_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 3x3 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + Offsets() = default; + + Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + + private: + size_t offsets_[3]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const { return get(-1, 0, 1); } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(0, 0, 1); + break; + + case FixedBorderType::WRAP: + return get(height_ - 1, 0, 1); + break; + + case FixedBorderType::REVERSE: + return get(1, 0, 1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t /* column_index */) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + case FixedBorderType::REFLECT: + return get(-1, 0, 0); + break; + + case FixedBorderType::WRAP: + return get(-1, 0, 1 - height_); + break; + + case FixedBorderType::REVERSE: + return get(-1, 0, -1); + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index == 0U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index == (height_ - 1U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2) const { + return Offsets{o0, o1, o2}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 3x3 filter border type. +template +using FixedBorderInfo3x3 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_3X3_H diff --git a/kleidicv/include/kleidicv/workspace/border_5x5.h b/kleidicv/include/kleidicv/workspace/border_5x5.h new file mode 100644 index 000000000..06c2683bd --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_5x5.h @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_5X5_H +#define KLEIDICV_WORKSPACE_BORDER_5X5_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 5x5 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4) + : offsets_{o0, o1, o2, o3, o4} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + + private: + size_t offsets_[5]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-2, -1, 0, 1, 2); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 1, 2); + } else { + return get(-1, -1, 0, 1, 2); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(1, 0, 0, 1, 2); + } else { + return get(-1, -1, 0, 1, 2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 2, height_ - 1, 0, 1, 2); + } else { + return get(height_ - 2, -1, 0, 1, 2); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(2, 1, 0, 1, 2); + } else { + return get(0, -1, 0, 1, 2); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 1); + } else { + return get(-2, -1, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 1); + } else { + return get(-2, -1, 0, 0, -1); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 2 - height_); + } else { + return get(-2, -1, 0, 1 - height_, 2 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 2)) { + return get(-2, -1, 0, 1, 0); + } else { + return get(-2, -1, 0, -1, -2); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 1U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 2U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, + size_t o4) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 5x5 filter border type. +template +using FixedBorderInfo5x5 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_5X5_H diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h new file mode 100644 index 000000000..75bb86117 --- /dev/null +++ b/kleidicv/include/kleidicv/workspace/border_7x7.h @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_7X7_H +#define KLEIDICV_WORKSPACE_BORDER_7X7_H + +#include "border_types.h" +#include "kleidicv/kleidicv.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for fixed-size filters. +template +class FixedBorderInfo; + +// Border offsets for 7x7 filters. +template +class FixedBorderInfo final { + public: + // Simple object holding read-only constant offsets. + class Offsets final { + public: + // NOLINTBEGIN(hicpp-member-init) + Offsets() = default; + // NOLINTEND(hicpp-member-init) + + Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) + : offsets_{o0, o1, o2, o3, o4, o5, o6} {} + + size_t c0() const { return offsets_[0]; } + size_t c1() const { return offsets_[1]; } + size_t c2() const { return offsets_[2]; } + size_t c3() const { return offsets_[3]; } + size_t c4() const { return offsets_[4]; } + size_t c5() const { return offsets_[5]; } + size_t c6() const { return offsets_[6]; } + + private: + size_t offsets_[7]; + }; + + FixedBorderInfo(size_t height, FixedBorderType border_type) + : height_(height), border_type_(border_type) {} + + // Returns offsets without the influence of any border. + Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + return get(-3, -2, -1, 0, 1, 2, 3); + } + + // Returns offsets for columns affected by left border. + Offsets offsets_with_left_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == 0) { + return get(0, 0, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(-1, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == 0) { + return get(2, 1, 0, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(0, -1, -1, 0, 1, 2, 3); + } else { + return get(-2, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::WRAP: + if (column_index == 0) { + return get(height_ - 3, height_ - 2, height_ - 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(height_ - 3, height_ - 2, -1, 0, 1, 2, 3); + } else { + return get(height_ - 3, -2, -1, 0, 1, 2, 3); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == 0) { + return get(3, 2, 1, 0, 1, 2, 3); + } else if (column_index == 1) { + return get(1, 0, -1, 0, 1, 2, 3); + } else { + return get(-1, -2, -1, 0, 1, 2, 3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for columns affected by right border. + Offsets offsets_with_right_border(size_t column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + switch (border_type_) { + case FixedBorderType::REPLICATE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 1); + } else { + return get(-3, -2, -1, 0, 0, 0, 0); + } + break; + + case FixedBorderType::REFLECT: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 2); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 1, 0); + } else { + return get(-3, -2, -1, 0, 0, -1, -2); + } + break; + + case FixedBorderType::WRAP: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 3 - height_); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 2 - height_, 3 - height_); + } else { + return get(-3, -2, -1, 0, 1 - height_, 2 - height_, 3 - height_); + } + break; + + case FixedBorderType::REVERSE: + if (column_index == (height_ - 3)) { + return get(-3, -2, -1, 0, 1, 2, 1); + } else if (column_index == (height_ - 2)) { + return get(-3, -2, -1, 0, 1, 0, -1); + } else { + return get(-3, -2, -1, 0, -1, -2, -3); + } + break; + } + // Unreachable. Compiler should emit a warning-as-error if any cases are + // uncovered above. + return Offsets{}; // GCOVR_EXCL_LINE + } + + // Returns offsets for rows or columns affected by any border. + Offsets offsets_with_border(size_t row_or_column_index) const + KLEIDICV_STREAMING_COMPATIBLE { + if (row_or_column_index <= 2U) { + // Rows and columns have the same offsets. + return offsets_with_left_border(row_or_column_index); + } + if (row_or_column_index >= (height_ - 3U)) { + // Rows and columns have the same offsets. + return offsets_with_right_border(row_or_column_index); + } + return offsets_without_border(); + } + + private: + // Takes care of static signed to unsigned casts. + Offsets get(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4, size_t o5, + size_t o6) const KLEIDICV_STREAMING_COMPATIBLE { + return Offsets{o0, o1, o2, o3, o4, o5, o6}; + } + + size_t height_; + FixedBorderType border_type_; +}; // end of class FixedBorderInfo + +// Shorthand for 7x7 filter border type. +template +using FixedBorderInfo7x7 = FixedBorderInfo; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_7X7_H diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 7a341d51e..6a501686e 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -168,7 +168,7 @@ class SeparableFilterWorkspace final { for (size_t horizontal_index = 0; horizontal_index < margin; ++horizontal_index) { auto offsets = - horizontal_border.offsets_with_top_or_left_border(horizontal_index); + horizontal_border.offsets_with_left_border(horizontal_index); filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), dst_rows.at(0, horizontal_index), offsets); @@ -188,8 +188,7 @@ class SeparableFilterWorkspace final { for (size_t horizontal_index = 0; horizontal_index < margin; ++horizontal_index) { size_t index = width - margin + horizontal_index; - auto offsets = - horizontal_border.offsets_with_bottom_or_right_border(index); + auto offsets = horizontal_border.offsets_with_right_border(index); filter.process_horizontal_borders(buffer_rows.at(0, index), dst_rows.at(0, index), offsets); } -- GitLab