From 8f16da1bc06e2d8719ff06703cd3d7a2b91494f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Podgain=C3=B5i?= Date: Wed, 7 Aug 2024 18:27:36 +0200 Subject: [PATCH] Refactor and unify separable filter drivers --- .../kleidicv/separable_filter_15x15_neon.h | 211 -------------- .../kleidicv/separable_filter_15x15_sc.h | 264 ----------------- .../kleidicv/separable_filter_3x3_neon.h | 153 ---------- .../kleidicv/separable_filter_3x3_sc.h | 161 ----------- .../kleidicv/separable_filter_5x5_neon.h | 141 --------- .../kleidicv/separable_filter_5x5_sc.h | 179 ------------ .../kleidicv/separable_filter_7x7_neon.h | 155 ---------- .../kleidicv/separable_filter_7x7_sc.h | 195 ------------- .../kleidicv/separable_filter_driver_neon.h | 273 ++++++++++++++++++ .../kleidicv/separable_filter_driver_sc.h | 273 ++++++++++++++++++ .../include/kleidicv/workspace/border_15x15.h | 16 +- .../include/kleidicv/workspace/border_3x3.h | 6 +- .../include/kleidicv/workspace/border_5x5.h | 8 +- .../include/kleidicv/workspace/border_7x7.h | 8 +- kleidicv/src/filters/gaussian_blur_neon.cpp | 7 +- kleidicv/src/filters/gaussian_blur_sc.h | 7 +- .../src/filters/separable_filter_2d_neon.cpp | 4 +- kleidicv/src/filters/separable_filter_2d_sc.h | 4 +- kleidicv/src/filters/sobel_neon.cpp | 7 +- kleidicv/src/filters/sobel_sc.h | 7 +- 20 files changed, 568 insertions(+), 1511 deletions(-) delete mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_15x15_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_3x3_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_5x5_sc.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_neon.h delete mode 100644 kleidicv/include/kleidicv/separable_filter_7x7_sc.h create mode 100644 kleidicv/include/kleidicv/separable_filter_driver_neon.h create mode 100644 kleidicv/include/kleidicv/separable_filter_driver_sc.h diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/separable_filter_15x15_neon.h deleted file mode 100644 index 2475d1db3..000000000 --- a/kleidicv/include/kleidicv/separable_filter_15x15_neon.h +++ /dev/null @@ -1,211 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border_15x15.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 15x15 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 7UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[15]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); - src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]); - src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]); - src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]); - src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]); - src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]); - src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]); - src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]); - src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[15]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - src[5] = src_rows.at(border_offsets.c5())[index]; - src[6] = src_rows.at(border_offsets.c6())[index]; - src[7] = src_rows.at(border_offsets.c7())[index]; - src[8] = src_rows.at(border_offsets.c8())[index]; - src[9] = src_rows.at(border_offsets.c9())[index]; - src[10] = src_rows.at(border_offsets.c10())[index]; - src[11] = src_rows.at(border_offsets.c11())[index]; - src[12] = src_rows.at(border_offsets.c12())[index]; - src[13] = src_rows.at(border_offsets.c13())[index]; - src[14] = src_rows.at(border_offsets.c14())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; - auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; - auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; - auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; - auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; - auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; - auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; - auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; - - BufferVectorType src_a[15], src_b[15]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - src_a[5] = vld1q(&src_5[0]); - src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); - src_a[6] = vld1q(&src_6[0]); - src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); - src_a[7] = vld1q(&src_7[0]); - src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]); - src_a[8] = vld1q(&src_8[0]); - src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]); - src_a[9] = vld1q(&src_9[0]); - src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]); - src_a[10] = vld1q(&src_10[0]); - src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]); - src_a[11] = vld1q(&src_11[0]); - src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]); - src_a[12] = vld1q(&src_12[0]); - src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]); - src_a[13] = vld1q(&src_13[0]); - src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]); - src_a[14] = vld1q(&src_14[0]); - src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[15]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); - src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]); - src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]); - src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]); - src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]); - src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]); - src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]); - src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]); - src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - src[7] = src_rows.at(0, border_offsets.c7())[index]; - src[8] = src_rows.at(0, border_offsets.c8())[index]; - src[9] = src_rows.at(0, border_offsets.c9())[index]; - src[10] = src_rows.at(0, border_offsets.c10())[index]; - src[11] = src_rows.at(0, border_offsets.c11())[index]; - src[12] = src_rows.at(0, border_offsets.c12())[index]; - src[13] = src_rows.at(0, border_offsets.c13())[index]; - src[14] = src_rows.at(0, border_offsets.c14())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 15x15 separable filters driver type. -template -using SeparableFilter15x15 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/separable_filter_15x15_sc.h deleted file mode 100644 index f95067a09..000000000 --- a/kleidicv/include/kleidicv/separable_filter_15x15_sc.h +++ /dev/null @@ -1,264 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_15X15_SC_H -#define KLEIDICV_SEPARABLE_FILTER_15X15_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_15x15.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 15x15 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 7UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); - SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c5())[index]); - SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c6())[index]); - SourceVectorType src_7 = - svld1(pg, &src_rows.at(border_offsets.c7())[index]); - SourceVectorType src_8 = - svld1(pg, &src_rows.at(border_offsets.c8())[index]); - SourceVectorType src_9 = - svld1(pg, &src_rows.at(border_offsets.c9())[index]); - SourceVectorType src_10 = - svld1(pg, &src_rows.at(border_offsets.c10())[index]); - SourceVectorType src_11 = - svld1(pg, &src_rows.at(border_offsets.c11())[index]); - SourceVectorType src_12 = - svld1(pg, &src_rows.at(border_offsets.c12())[index]); - SourceVectorType src_13 = - svld1(pg, &src_rows.at(border_offsets.c13())[index]); - SourceVectorType src_14 = - svld1(pg, &src_rows.at(border_offsets.c14())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - auto src_7 = &src_rows.at(0, border_offsets.c7())[index]; - auto src_8 = &src_rows.at(0, border_offsets.c8())[index]; - auto src_9 = &src_rows.at(0, border_offsets.c9())[index]; - auto src_10 = &src_rows.at(0, border_offsets.c10())[index]; - auto src_11 = &src_rows.at(0, border_offsets.c11())[index]; - auto src_12 = &src_rows.at(0, border_offsets.c12())[index]; - auto src_13 = &src_rows.at(0, border_offsets.c13())[index]; - auto src_14 = &src_rows.at(0, border_offsets.c14())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - BufferVectorType src_0_5 = svld1(pg, &src_5[0]); - BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); - BufferVectorType src_0_6 = svld1(pg, &src_6[0]); - BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); - BufferVectorType src_0_7 = svld1(pg, &src_7[0]); - BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); - BufferVectorType src_0_8 = svld1(pg, &src_8[0]); - BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); - BufferVectorType src_0_9 = svld1(pg, &src_9[0]); - BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); - BufferVectorType src_0_10 = svld1(pg, &src_10[0]); - BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); - BufferVectorType src_0_11 = svld1(pg, &src_11[0]); - BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); - BufferVectorType src_0_12 = svld1(pg, &src_12[0]); - BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); - BufferVectorType src_0_13 = svld1(pg, &src_13[0]); - BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); - BufferVectorType src_0_14 = svld1(pg, &src_14[0]); - BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, src_0_7, src_0_8, - src_0_9, src_0_10, src_0_11, src_0_12, - src_0_13, src_0_14, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, - src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); - BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); - BufferVectorType src_7 = - svld1(pg, &src_rows.at(0, border_offsets.c7())[index]); - BufferVectorType src_8 = - svld1(pg, &src_rows.at(0, border_offsets.c8())[index]); - BufferVectorType src_9 = - svld1(pg, &src_rows.at(0, border_offsets.c9())[index]); - BufferVectorType src_10 = - svld1(pg, &src_rows.at(0, border_offsets.c10())[index]); - BufferVectorType src_11 = - svld1(pg, &src_rows.at(0, border_offsets.c11())[index]); - BufferVectorType src_12 = - svld1(pg, &src_rows.at(0, border_offsets.c12())[index]); - BufferVectorType src_13 = - svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); - BufferVectorType src_14 = - svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[15]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - src[7] = src_rows.at(0, border_offsets.c7())[index]; - src[8] = src_rows.at(0, border_offsets.c8())[index]; - src[9] = src_rows.at(0, border_offsets.c9())[index]; - src[10] = src_rows.at(0, border_offsets.c10())[index]; - src[11] = src_rows.at(0, border_offsets.c11())[index]; - src[12] = src_rows.at(0, border_offsets.c12())[index]; - src[13] = src_rows.at(0, border_offsets.c13())[index]; - src[14] = src_rows.at(0, border_offsets.c14())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 15x15 separable filters driver type. -template -using SeparableFilter15x15 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_15X15_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/separable_filter_3x3_neon.h deleted file mode 100644 index 3fecea047..000000000 --- a/kleidicv/include/kleidicv/separable_filter_3x3_neon.h +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border_3x3.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 1UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(border_offsets.c0())[index]; - auto src_1 = &src_rows.at(border_offsets.c1())[index]; - auto src_2 = &src_rows.at(border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - SourceVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.vertical_vector_path(src_a, &dst_rows[index]); - filter_.vertical_vector_path( - src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - SourceVectorType src[3]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[3]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - auto src_0_x2 = vld1q_x2(&src_0[0]); - auto src_1_x2 = vld1q_x2(&src_1[0]); - auto src_2_x2 = vld1q_x2(&src_2[0]); - - BufferVectorType src_a[3], src_b[3]; - src_a[0] = src_0_x2.val[0]; - src_b[0] = src_0_x2.val[1]; - src_a[1] = src_1_x2.val[0]; - src_b[1] = src_1_x2.val[1]; - src_a[2] = src_2_x2.val[0]; - src_b[2] = src_2_x2.val[1]; - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[3]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/separable_filter_3x3_sc.h deleted file mode 100644 index 6f624ae1c..000000000 --- a/kleidicv/include/kleidicv/separable_filter_3x3_sc.h +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_3X3_SC_H -#define KLEIDICV_SEPARABLE_FILTER_3X3_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_3x3.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 3x3 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 1UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, - &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[3]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 3x3 separable filters driver type. -template -using SeparableFilter3x3 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_3X3_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/separable_filter_5x5_neon.h deleted file mode 100644 index 34f4290d7..000000000 --- a/kleidicv/include/kleidicv/separable_filter_5x5_neon.h +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border_5x5.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 2UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[5]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[5]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_a[5], src_b[5]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[5]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_5X5_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/separable_filter_5x5_sc.h deleted file mode 100644 index 909e8ce18..000000000 --- a/kleidicv/include/kleidicv/separable_filter_5x5_sc.h +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_5X5_SC_H -#define KLEIDICV_SEPARABLE_FILTER_5X5_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_5x5.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 5x5 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 2UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[5]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 5x5 separable filters driver type. -template -using SeparableFilter5x5 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_5X5_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/separable_filter_7x7_neon.h deleted file mode 100644 index 4305d9d06..000000000 --- a/kleidicv/include/kleidicv/separable_filter_7x7_neon.h +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H -#define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H - -#include "kleidicv/neon.h" -#include "kleidicv/workspace/border_7x7.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 7x7 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = typename neon::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = typename neon::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) : filter_{filter} {} - - static constexpr size_t margin = 3UL; - - void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) { - SourceVectorType src[7]; - src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]); - filter_.vertical_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - SourceType src[7]; - src[0] = src_rows.at(border_offsets.c0())[index]; - src[1] = src_rows.at(border_offsets.c1())[index]; - src[2] = src_rows.at(border_offsets.c2())[index]; - src[3] = src_rows.at(border_offsets.c3())[index]; - src[4] = src_rows.at(border_offsets.c4())[index]; - src[5] = src_rows.at(border_offsets.c5())[index]; - src[6] = src_rows.at(border_offsets.c6())[index]; - filter_.vertical_scalar_path(src, &dst_rows[index]); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - LoopUnroll2 loop{width * src_rows.channels(), - BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - - BufferVectorType src_a[7], src_b[7]; - src_a[0] = vld1q(&src_0[0]); - src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); - src_a[1] = vld1q(&src_1[0]); - src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); - src_a[2] = vld1q(&src_2[0]); - src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); - src_a[3] = vld1q(&src_3[0]); - src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); - src_a[4] = vld1q(&src_4[0]); - src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); - src_a[5] = vld1q(&src_5[0]); - src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]); - src_a[6] = vld1q(&src_6[0]); - src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]); - - filter_.horizontal_vector_path(src_a, &dst_rows[index]); - filter_.horizontal_vector_path( - src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); - }); - - loop.unroll_once([&](size_t index) { - BufferVectorType src[7]; - src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]); - src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]); - src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]); - src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]); - src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]); - src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]); - src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]); - filter_.horizontal_vector_path(src, &dst_rows[index]); - }); - - loop.tail([&](size_t index) { - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal_borders(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_scalar(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void process_horizontal_scalar(Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const { - BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 7x7 separable filters driver type. -template -using SeparableFilter7x7 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/separable_filter_7x7_sc.h deleted file mode 100644 index 33f204a10..000000000 --- a/kleidicv/include/kleidicv/separable_filter_7x7_sc.h +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#ifndef KLEIDICV_SEPARABLE_FILTER_7X7_SC_H -#define KLEIDICV_SEPARABLE_FILTER_7X7_SC_H - -#include "kleidicv/sve2.h" -#include "kleidicv/workspace/border_7x7.h" - -// It is used by SVE2 and SME2, the actual namespace will reflect it. -namespace KLEIDICV_TARGET_NAMESPACE { - -// Template for drivers of separable NxM filters. -template -class SeparableFilter; - -// Driver for a separable 7x7 filter. -template -class SeparableFilter { - public: - using SourceType = typename FilterType::SourceType; - using BufferType = typename FilterType::BufferType; - using DestinationType = typename FilterType::DestinationType; - using SourceVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using SourceVectorType = typename SourceVecTraits::VectorType; - using BufferVecTraits = - typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; - using BufferVectorType = typename BufferVecTraits::VectorType; - using BorderInfoType = - typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7; - using BorderType = FixedBorderType; - using BorderOffsets = typename BorderInfoType::Offsets; - - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE - : filter_{filter} {} - - static constexpr size_t margin = 3UL; - - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); - }); - - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); - }); - - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); - } - - // Processing of horizontal borders is always scalar because border offsets - // change for each and every element in the border. - void process_horizontal_borders( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - for (size_t index = 0; index < src_rows.channels(); ++index) { - disable_loop_vectorization(); - process_horizontal_border(src_rows, dst_rows, border_offsets, index); - } - } - - private: - void vertical_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - SourceVectorType src_0 = - svld1(pg, &src_rows.at(border_offsets.c0())[index]); - SourceVectorType src_1 = - svld1(pg, &src_rows.at(border_offsets.c1())[index]); - SourceVectorType src_2 = - svld1(pg, &src_rows.at(border_offsets.c2())[index]); - SourceVectorType src_3 = - svld1(pg, &src_rows.at(border_offsets.c3())[index]); - SourceVectorType src_4 = - svld1(pg, &src_rows.at(border_offsets.c4())[index]); - SourceVectorType src_5 = - svld1(pg, &src_rows.at(border_offsets.c5())[index]); - SourceVectorType src_6 = - svld1(pg, &src_rows.at(border_offsets.c6())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, &dst_rows[index]); - } - - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; - auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; - auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; - auto src_3 = &src_rows.at(0, border_offsets.c3())[index]; - auto src_4 = &src_rows.at(0, border_offsets.c4())[index]; - auto src_5 = &src_rows.at(0, border_offsets.c5())[index]; - auto src_6 = &src_rows.at(0, border_offsets.c6())[index]; - - BufferVectorType src_0_0 = svld1(pg, &src_0[0]); - BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); - BufferVectorType src_0_1 = svld1(pg, &src_1[0]); - BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); - BufferVectorType src_0_2 = svld1(pg, &src_2[0]); - BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - BufferVectorType src_0_3 = svld1(pg, &src_3[0]); - BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); - BufferVectorType src_0_4 = svld1(pg, &src_4[0]); - BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - BufferVectorType src_0_5 = svld1(pg, &src_5[0]); - BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); - BufferVectorType src_0_6 = svld1(pg, &src_6[0]); - BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - &dst_rows[index + BufferVecTraits::num_lanes()]); - } - - void horizontal_vector_path(svbool_t pg, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { - BufferVectorType src_0 = - svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); - BufferVectorType src_1 = - svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); - BufferVectorType src_2 = - svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - BufferVectorType src_3 = - svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); - BufferVectorType src_4 = - svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - BufferVectorType src_5 = - svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); - BufferVectorType src_6 = - svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, &dst_rows[index]); - } - - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { - BufferType src[7]; - src[0] = src_rows.at(0, border_offsets.c0())[index]; - src[1] = src_rows.at(0, border_offsets.c1())[index]; - src[2] = src_rows.at(0, border_offsets.c2())[index]; - src[3] = src_rows.at(0, border_offsets.c3())[index]; - src[4] = src_rows.at(0, border_offsets.c4())[index]; - src[5] = src_rows.at(0, border_offsets.c5())[index]; - src[6] = src_rows.at(0, border_offsets.c6())[index]; - filter_.horizontal_scalar_path(src, &dst_rows[index]); - } - - FilterType filter_; -}; // end of class SeparableFilter - -// Shorthand for 7x7 separable filters driver type. -template -using SeparableFilter7x7 = SeparableFilter; - -} // namespace KLEIDICV_TARGET_NAMESPACE - -#endif // KLEIDICV_SEPARABLE_FILTER_7X7_SC_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_neon.h b/kleidicv/include/kleidicv/separable_filter_driver_neon.h new file mode 100644 index 000000000..3f8c4ddbe --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_driver_neon.h @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H + +#include "kleidicv/config.h" +#include "kleidicv/neon.h" +#include "kleidicv/workspace/border_15x15.h" +#include "kleidicv/workspace/border_3x3.h" +#include "kleidicv/workspace/border_5x5.h" +#include "kleidicv/workspace/border_7x7.h" + +namespace kleidicv::neon { + +// Template for drivers of separable NxM filters. +template +class SeparableFilterDriver { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = typename neon::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = typename neon::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = typename neon::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilterDriver(FilterType filter) : filter_{filter} {} + + static constexpr size_t margin = KernelSize >> 1; + + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + SourceVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + if constexpr (KernelSize == 3) { + loop.unroll_twice([&](size_t index) { + auto src_0_x2 = vld1q_x2(&src_rows.at(border_offsets.c(0))[index]); + auto src_1_x2 = vld1q_x2(&src_rows.at(border_offsets.c(1))[index]); + auto src_2_x2 = vld1q_x2(&src_rows.at(border_offsets.c(2))[index]); + + SourceVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.vertical_vector_path(src_a, &dst_rows[index]); + filter_.vertical_vector_path( + src_b, &dst_rows[index + SourceVecTraits::num_lanes()]); + }); + } + + loop.unroll_once([&](size_t index) { + vertical_vector_path(src_rows, dst_rows, border_offsets, index, seq); + }); + + loop.tail([&](size_t index) { + vertical_scalar_path(src_rows, dst_rows, border_offsets, index, seq); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + LoopUnroll2 loop{width * src_rows.channels(), + BufferVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + loop.unroll_twice([&](size_t index) { + horizontal_vector_path_2x(src_rows, dst_rows, border_offsets, index); + }); + + loop.unroll_once([&](size_t index) { + horizontal_vector_path(src_rows, dst_rows, border_offsets, index, seq); + }); + + loop.tail([&](size_t index) { + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); + }); + } + + void process_horizontal_borders(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const { + constexpr auto seq = std::make_index_sequence{}; + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_scalar(src_rows, dst_rows, border_offsets, index, seq); + } + } + + private: + template + void vertical_vector_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + SourceVectorType src[KernelSize] = { + vld1q(&src_rows.at(border_offsets.c(SeqNum))[index])...}; + filter_.vertical_vector_path(src, &dst_rows[index]); + } + + template + void vertical_scalar_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + SourceType src[KernelSize] = { + src_rows.at(border_offsets.c(SeqNum))[index]...}; + filter_.vertical_scalar_path(src, &dst_rows[index]); + } + + void horizontal_vector_path_2x(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const { + if constexpr (KernelSize == 3) { + auto src_0_x2 = vld1q_x2(&src_rows.at(0, border_offsets.c(0))[index]); + auto src_1_x2 = vld1q_x2(&src_rows.at(0, border_offsets.c(1))[index]); + auto src_2_x2 = vld1q_x2(&src_rows.at(0, border_offsets.c(2))[index]); + + BufferVectorType src_a[3], src_b[3]; + src_a[0] = src_0_x2.val[0]; + src_b[0] = src_0_x2.val[1]; + src_a[1] = src_1_x2.val[0]; + src_b[1] = src_1_x2.val[1]; + src_a[2] = src_2_x2.val[0]; + src_b[2] = src_2_x2.val[1]; + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 5) { + BufferVectorType src_a[5], src_b[5]; + src_a[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src_b[0] = vld1q(&src_rows.at( + 0, border_offsets.c(0))[index + BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src_b[1] = vld1q(&src_rows.at( + 0, border_offsets.c(1))[index + BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src_b[2] = vld1q(&src_rows.at( + 0, border_offsets.c(2))[index + BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src_b[3] = vld1q(&src_rows.at( + 0, border_offsets.c(3))[index + BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src_b[4] = vld1q(&src_rows.at( + 0, border_offsets.c(4))[index + BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 7) { + BufferVectorType src_a[7], src_b[7]; + src_a[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src_b[0] = vld1q(&src_rows.at( + 0, border_offsets.c(0))[index + BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src_b[1] = vld1q(&src_rows.at( + 0, border_offsets.c(1))[index + BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src_b[2] = vld1q(&src_rows.at( + 0, border_offsets.c(2))[index + BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src_b[3] = vld1q(&src_rows.at( + 0, border_offsets.c(3))[index + BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src_b[4] = vld1q(&src_rows.at( + 0, border_offsets.c(4))[index + BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src_b[5] = vld1q(&src_rows.at( + 0, border_offsets.c(5))[index + BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + src_b[6] = vld1q(&src_rows.at( + 0, border_offsets.c(6))[index + BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 15) { + BufferVectorType src_a[15], src_b[15]; + src_a[0] = vld1q(&src_rows.at(0, border_offsets.c(0))[index]); + src_b[0] = vld1q(&src_rows.at( + 0, border_offsets.c(0))[index + BufferVecTraits::num_lanes()]); + src_a[1] = vld1q(&src_rows.at(0, border_offsets.c(1))[index]); + src_b[1] = vld1q(&src_rows.at( + 0, border_offsets.c(1))[index + BufferVecTraits::num_lanes()]); + src_a[2] = vld1q(&src_rows.at(0, border_offsets.c(2))[index]); + src_b[2] = vld1q(&src_rows.at( + 0, border_offsets.c(2))[index + BufferVecTraits::num_lanes()]); + src_a[3] = vld1q(&src_rows.at(0, border_offsets.c(3))[index]); + src_b[3] = vld1q(&src_rows.at( + 0, border_offsets.c(3))[index + BufferVecTraits::num_lanes()]); + src_a[4] = vld1q(&src_rows.at(0, border_offsets.c(4))[index]); + src_b[4] = vld1q(&src_rows.at( + 0, border_offsets.c(4))[index + BufferVecTraits::num_lanes()]); + src_a[5] = vld1q(&src_rows.at(0, border_offsets.c(5))[index]); + src_b[5] = vld1q(&src_rows.at( + 0, border_offsets.c(5))[index + BufferVecTraits::num_lanes()]); + src_a[6] = vld1q(&src_rows.at(0, border_offsets.c(6))[index]); + src_b[6] = vld1q(&src_rows.at( + 0, border_offsets.c(6))[index + BufferVecTraits::num_lanes()]); + src_a[7] = vld1q(&src_rows.at(0, border_offsets.c(7))[index]); + src_b[7] = vld1q(&src_rows.at( + 0, border_offsets.c(7))[index + BufferVecTraits::num_lanes()]); + src_a[8] = vld1q(&src_rows.at(0, border_offsets.c(8))[index]); + src_b[8] = vld1q(&src_rows.at( + 0, border_offsets.c(8))[index + BufferVecTraits::num_lanes()]); + src_a[9] = vld1q(&src_rows.at(0, border_offsets.c(9))[index]); + src_b[9] = vld1q(&src_rows.at( + 0, border_offsets.c(9))[index + BufferVecTraits::num_lanes()]); + src_a[10] = vld1q(&src_rows.at(0, border_offsets.c(10))[index]); + src_b[10] = vld1q(&src_rows.at( + 0, border_offsets.c(10))[index + BufferVecTraits::num_lanes()]); + src_a[11] = vld1q(&src_rows.at(0, border_offsets.c(11))[index]); + src_b[11] = vld1q(&src_rows.at( + 0, border_offsets.c(11))[index + BufferVecTraits::num_lanes()]); + src_a[12] = vld1q(&src_rows.at(0, border_offsets.c(12))[index]); + src_b[12] = vld1q(&src_rows.at( + 0, border_offsets.c(12))[index + BufferVecTraits::num_lanes()]); + src_a[13] = vld1q(&src_rows.at(0, border_offsets.c(13))[index]); + src_b[13] = vld1q(&src_rows.at( + 0, border_offsets.c(13))[index + BufferVecTraits::num_lanes()]); + src_a[14] = vld1q(&src_rows.at(0, border_offsets.c(14))[index]); + src_b[14] = vld1q(&src_rows.at( + 0, border_offsets.c(14))[index + BufferVecTraits::num_lanes()]); + + filter_.horizontal_vector_path(src_a, &dst_rows[index]); + filter_.horizontal_vector_path( + src_b, &dst_rows[index + BufferVecTraits::num_lanes()]); + } else { + static_assert(KernelSize != KernelSize, + "please define variants for other kernel sizes"); + } + } + + template + void horizontal_vector_path(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + BufferVectorType src[KernelSize] = { + vld1q(&src_rows.at(0, border_offsets.c(SeqNum))[index])...}; + filter_.horizontal_vector_path(src, &dst_rows[index]); + } + + template + void process_horizontal_scalar(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const { + BufferType src[KernelSize] = { + src_rows.at(0, border_offsets.c(SeqNum))[index]...}; + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilterDriver + +} // namespace kleidicv::neon + +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_NEON_H diff --git a/kleidicv/include/kleidicv/separable_filter_driver_sc.h b/kleidicv/include/kleidicv/separable_filter_driver_sc.h new file mode 100644 index 000000000..ec227e961 --- /dev/null +++ b/kleidicv/include/kleidicv/separable_filter_driver_sc.h @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H +#define KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H + +#include "kleidicv/config.h" +#include "kleidicv/sve2.h" +#include "kleidicv/workspace/border_15x15.h" +#include "kleidicv/workspace/border_3x3.h" +#include "kleidicv/workspace/border_5x5.h" +#include "kleidicv/workspace/border_7x7.h" + +// It is used by SVE2 and SME2, the actual namespace will reflect it. +namespace KLEIDICV_TARGET_NAMESPACE { + +// Template for drivers of separable NxM filters. +template +class SeparableFilterDriver { + public: + using SourceType = typename FilterType::SourceType; + using BufferType = typename FilterType::BufferType; + using DestinationType = typename FilterType::DestinationType; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BorderInfoType = + typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo; + using BorderType = FixedBorderType; + using BorderOffsets = typename BorderInfoType::Offsets; + + explicit SeparableFilterDriver(FilterType filter) + KLEIDICV_STREAMING_COMPATIBLE : filter_{filter} {} + + static constexpr size_t margin = KernelSize >> 1; + + void process_vertical( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, + seq); + }); + + loop.remaining([&](size_t index, + size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index, seq); + }); + } + + void process_horizontal(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const + KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg_all = BufferVecTraits::svptrue(); + LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + constexpr auto seq = std::make_index_sequence{}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, + index); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index, + seq); + }); + + loop.remaining( + [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index, + seq); + }); + } + + // Processing of horizontal borders is always scalar because border offsets + // change for each and every element in the border. + void process_horizontal_borders( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + for (size_t index = 0; index < src_rows.channels(); ++index) { + disable_loop_vectorization(); + process_horizontal_border(src_rows, dst_rows, border_offsets, index); + } + } + + private: + template + void vertical_vector_path( + svbool_t pg, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const KLEIDICV_STREAMING_COMPATIBLE { + filter_.vertical_vector_path( + pg, svld1(pg, &src_rows.at(border_offsets.c(SeqNum))[index])..., + &dst_rows[index]); + } + + void horizontal_vector_path_2x( + svbool_t pg, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + if constexpr (KernelSize == 3) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, + &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 5) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 7) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + + filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, + src_0_4, src_0_5, src_0_6, + &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + &dst_rows[index + BufferVecTraits::num_lanes()]); + } else if constexpr (KernelSize == 15) { + auto src_0 = &src_rows.at(0, border_offsets.c(0))[index]; + auto src_1 = &src_rows.at(0, border_offsets.c(1))[index]; + auto src_2 = &src_rows.at(0, border_offsets.c(2))[index]; + auto src_3 = &src_rows.at(0, border_offsets.c(3))[index]; + auto src_4 = &src_rows.at(0, border_offsets.c(4))[index]; + auto src_5 = &src_rows.at(0, border_offsets.c(5))[index]; + auto src_6 = &src_rows.at(0, border_offsets.c(6))[index]; + auto src_7 = &src_rows.at(0, border_offsets.c(7))[index]; + auto src_8 = &src_rows.at(0, border_offsets.c(8))[index]; + auto src_9 = &src_rows.at(0, border_offsets.c(9))[index]; + auto src_10 = &src_rows.at(0, border_offsets.c(10))[index]; + auto src_11 = &src_rows.at(0, border_offsets.c(11))[index]; + auto src_12 = &src_rows.at(0, border_offsets.c(12))[index]; + auto src_13 = &src_rows.at(0, border_offsets.c(13))[index]; + auto src_14 = &src_rows.at(0, border_offsets.c(14))[index]; + + BufferVectorType src_0_0 = svld1(pg, &src_0[0]); + BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1); + BufferVectorType src_0_1 = svld1(pg, &src_1[0]); + BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1); + BufferVectorType src_0_2 = svld1(pg, &src_2[0]); + BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); + BufferVectorType src_0_3 = svld1(pg, &src_3[0]); + BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); + BufferVectorType src_0_4 = svld1(pg, &src_4[0]); + BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); + BufferVectorType src_0_5 = svld1(pg, &src_5[0]); + BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); + BufferVectorType src_0_6 = svld1(pg, &src_6[0]); + BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); + BufferVectorType src_0_7 = svld1(pg, &src_7[0]); + BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1); + BufferVectorType src_0_8 = svld1(pg, &src_8[0]); + BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1); + BufferVectorType src_0_9 = svld1(pg, &src_9[0]); + BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1); + BufferVectorType src_0_10 = svld1(pg, &src_10[0]); + BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1); + BufferVectorType src_0_11 = svld1(pg, &src_11[0]); + BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1); + BufferVectorType src_0_12 = svld1(pg, &src_12[0]); + BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1); + BufferVectorType src_0_13 = svld1(pg, &src_13[0]); + BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1); + BufferVectorType src_0_14 = svld1(pg, &src_14[0]); + BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); + + filter_.horizontal_vector_path( + pg, src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6, + src_0_7, src_0_8, src_0_9, src_0_10, src_0_11, src_0_12, src_0_13, + src_0_14, &dst_rows[index]); + filter_.horizontal_vector_path( + pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, + src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); + } else { + static_assert(KernelSize != KernelSize, + "please define variants for other kernel sizes"); + } + } + + template + void horizontal_vector_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, size_t index, + std::index_sequence) const + KLEIDICV_STREAMING_COMPATIBLE { + filter_.horizontal_vector_path( + pg, svld1(pg, &src_rows.at(0, border_offsets.c(SeqNum))[index])..., + &dst_rows[index]); + } + + void process_horizontal_border( + Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType src[KernelSize]; + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 0; i < KernelSize; i++) { + src[i] = src_rows.at(0, border_offsets.c(i))[index]; + } + filter_.horizontal_scalar_path(src, &dst_rows[index]); + } + + FilterType filter_; +}; // end of class SeparableFilterDriver + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_SEPARABLE_FILTER_DRIVER_SC_H diff --git a/kleidicv/include/kleidicv/workspace/border_15x15.h b/kleidicv/include/kleidicv/workspace/border_15x15.h index eb3ae12ad..f020cbb98 100644 --- a/kleidicv/include/kleidicv/workspace/border_15x15.h +++ b/kleidicv/include/kleidicv/workspace/border_15x15.h @@ -31,21 +31,7 @@ class FixedBorderInfo final { : offsets_{o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14} {} - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - size_t c5() const { return offsets_[5]; } - size_t c6() const { return offsets_[6]; } - size_t c7() const { return offsets_[7]; } - size_t c8() const { return offsets_[8]; } - size_t c9() const { return offsets_[9]; } - size_t c10() const { return offsets_[10]; } - size_t c11() const { return offsets_[11]; } - size_t c12() const { return offsets_[12]; } - size_t c13() const { return offsets_[13]; } - size_t c14() const { return offsets_[14]; } + size_t c(size_t i) const { return offsets_[i]; } private: size_t offsets_[15]; diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h index ecd5627d3..98b8dcfa9 100644 --- a/kleidicv/include/kleidicv/workspace/border_3x3.h +++ b/kleidicv/include/kleidicv/workspace/border_3x3.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -25,9 +25,7 @@ class FixedBorderInfo final { Offsets(size_t o0, size_t o1, size_t o2) : offsets_{o0, o1, o2} {} - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } + size_t c(size_t i) const { return offsets_[i]; } private: size_t offsets_[3]; diff --git a/kleidicv/include/kleidicv/workspace/border_5x5.h b/kleidicv/include/kleidicv/workspace/border_5x5.h index 06c2683bd..b5409b328 100644 --- a/kleidicv/include/kleidicv/workspace/border_5x5.h +++ b/kleidicv/include/kleidicv/workspace/border_5x5.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -28,11 +28,7 @@ class FixedBorderInfo final { Offsets(size_t o0, size_t o1, size_t o2, size_t o3, size_t o4) : offsets_{o0, o1, o2, o3, o4} {} - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } + size_t c(size_t i) const { return offsets_[i]; } private: size_t offsets_[5]; diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h index 75bb86117..1abd0e28a 100644 --- a/kleidicv/include/kleidicv/workspace/border_7x7.h +++ b/kleidicv/include/kleidicv/workspace/border_7x7.h @@ -29,13 +29,7 @@ class FixedBorderInfo final { size_t o6) : offsets_{o0, o1, o2, o3, o4, o5, o6} {} - size_t c0() const { return offsets_[0]; } - size_t c1() const { return offsets_[1]; } - size_t c2() const { return offsets_[2]; } - size_t c3() const { return offsets_[3]; } - size_t c4() const { return offsets_[4]; } - size_t c5() const { return offsets_[5]; } - size_t c6() const { return offsets_[6]; } + size_t c(size_t i) const { return offsets_[i]; } private: size_t offsets_[7]; diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index bf4d82aa1..e9092553c 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -8,10 +8,7 @@ #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_15x15_neon.h" -#include "kleidicv/separable_filter_3x3_neon.h" -#include "kleidicv/separable_filter_5x5_neon.h" -#include "kleidicv/separable_filter_7x7_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" #include "kleidicv/sigma.h" namespace kleidicv::neon { @@ -627,7 +624,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; + SeparableFilterDriver filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index e11fb8a58..9b0cdd809 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -8,10 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_15x15_sc.h" -#include "kleidicv/separable_filter_3x3_sc.h" -#include "kleidicv/separable_filter_5x5_sc.h" -#include "kleidicv/separable_filter_7x7_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sigma.h" #include "kleidicv/sve2.h" @@ -819,7 +816,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( using GaussianBlurFilter = GaussianBlur; GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; + SeparableFilterDriver filter{blur}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index 8d3d0d3ed..9faacb3ee 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -8,7 +8,7 @@ #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_5x5_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { @@ -139,7 +139,7 @@ kleidicv_error_t separable_filter_2d_u8( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilter filter{filterClass}; + SeparableFilterDriver filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index 9ba9c9fb8..aa9fde64a 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -8,7 +8,7 @@ #include #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_5x5_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -159,7 +159,7 @@ static kleidicv_error_t separable_filter_2d_u8_sc( using SeparableFilterClass = SeparableFilter2D; SeparableFilterClass filterClass{kernel_x, kernel_y}; - SeparableFilter filter{filterClass}; + SeparableFilterDriver filter{filterClass}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 09e108575..96959ebd8 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -6,7 +6,7 @@ #include "kleidicv/kleidicv.h" #include "kleidicv/morphology/workspace.h" #include "kleidicv/neon.h" -#include "kleidicv/separable_filter_3x3_neon.h" +#include "kleidicv/separable_filter_driver_neon.h" namespace kleidicv::neon { @@ -157,7 +157,8 @@ kleidicv_error_t sobel_3x3_horizontal_s16_u8(const uint8_t *src, } HorizontalSobel3x3 horizontal_sobel; - SeparableFilter3x3> filter{horizontal_sobel}; + SeparableFilterDriver, 3> filter{ + horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -193,7 +194,7 @@ kleidicv_error_t sobel_3x3_vertical_s16_u8(const uint8_t *src, } VerticalSobel3x3 vertical_sobel; - SeparableFilter3x3> filter{vertical_sobel}; + SeparableFilterDriver, 3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 783cd55ad..ab6a0c6e5 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -7,7 +7,7 @@ #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" -#include "kleidicv/separable_filter_3x3_sc.h" +#include "kleidicv/separable_filter_driver_sc.h" #include "kleidicv/sve2.h" namespace KLEIDICV_TARGET_NAMESPACE { @@ -148,7 +148,8 @@ static kleidicv_error_t sobel_3x3_horizontal_s16_u8_sc( } HorizontalSobel3x3 horizontal_sobel; - SeparableFilter3x3> filter{horizontal_sobel}; + SeparableFilterDriver, 3> filter{ + horizontal_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; @@ -184,7 +185,7 @@ static kleidicv_error_t sobel_3x3_vertical_s16_u8_sc( } VerticalSobel3x3 vertical_sobel; - SeparableFilter3x3> filter{vertical_sobel}; + SeparableFilterDriver, 3> filter{vertical_sobel}; workspace->process(rect, src_rows, dst_rows, channels, FixedBorderType::REPLICATE, filter); return KLEIDICV_OK; -- GitLab