diff --git a/examples/extract_one_operation/kleidicv/config.h b/examples/extract_one_operation/kleidicv/config.h index 780908a0eb0fb409aa3ffa4bc447f2acc803cfe6..01e96eeb10b33efabe64a724f94aec9699b0e34f 100644 --- a/examples/extract_one_operation/kleidicv/config.h +++ b/examples/extract_one_operation/kleidicv/config.h @@ -6,7 +6,7 @@ // needs of the example. #define KLEIDICV_LOCALLY_STREAMING __arm_locally_streaming -#define KLEIDICV_STREAMING_COMPATIBLE __arm_streaming_compatible +#define KLEIDICV_STREAMING __arm_streaming #define KLEIDICV_UNLIKELY(cond) __builtin_expect((cond), 0) diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt index 4371b13ec10d9e74eaa476ce40f97b7d680d0d62..070c9caa9e5224843d9ef7ce81f0d0baad41d56f 100644 --- a/kleidicv/CMakeLists.txt +++ b/kleidicv/CMakeLists.txt @@ -29,18 +29,6 @@ option(KLEIDICV_ENABLE_SVE2 "Enable SVE2 code paths" ${KLEIDICV_ENABLE_SVE2_DEFA # https://github.com/ARM-software/acle/blob/main/main/acle.md#sme-language-extensions-and-intrinsics # If SME2 is enabled than SME is also enabled by default. option(KLEIDICV_ENABLE_SME2 "Enable SME2 code paths" OFF) - -# Temporary while there is no SME2 code -file(GLOB KLEIDICV_SME2_SOURCES - "${CMAKE_CURRENT_LIST_DIR}/src/*_sme2.cpp" - "${CMAKE_CURRENT_LIST_DIR}/src/**/*_sme2.cpp" -) - -if (NOT KLEIDICV_SME2_SOURCES) - set(KLEIDICV_ENABLE_SME2 OFF) - message("KLEIDICV_ENABLE_SME2 was disabled since there is no SME2 sources") -endif() - if(KLEIDICV_ENABLE_SME2) option(KLEIDICV_ENABLE_SME "Enable SME code paths" ON) else() @@ -78,6 +66,8 @@ option(KLEIDICV_ASSUME_128BIT_SVE2 "Internal - If turned ON 128-bit SVE2 vector option(KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE "Internal - If turned ON interleaving loads and stores are preferred instead of continuous loads and stores" OFF) option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny algorithm" OFF) option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates bit exact result compared to OpenCV's original implementation" ON) +# Marked experimental while CI does not test SME2 version of saturating add. +option(KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 "Internal - Enable SME2 version of saturating add" OFF) # Continuous load and store NEON instructions produce suboptimal code generation on GCC version <= 11, # and these instructions are not supported on GCC version <=8. @@ -121,6 +111,11 @@ file(GLOB KLEIDICV_SME_SOURCES "${CMAKE_CURRENT_LIST_DIR}/src/**/*_sme.cpp" ) +file(GLOB KLEIDICV_SME2_SOURCES + "${CMAKE_CURRENT_LIST_DIR}/src/*_sme2.cpp" + "${CMAKE_CURRENT_LIST_DIR}/src/**/*_sme2.cpp" +) + set(KLEIDICV_INCLUDE_DIRS "${CMAKE_CURRENT_LIST_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in index 7666c24fee1e488324464004e2b045ffa6489092..4d71bf393a25d4d20e9a46ab8210da99ff3d713c 100644 --- a/kleidicv/include/kleidicv/config.h.in +++ b/kleidicv/include/kleidicv/config.h.in @@ -27,6 +27,8 @@ #cmakedefine01 KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS +#cmakedefine01 KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 + // Set to '1' if compiling NEON code paths, otherwise it is set to '0'. #ifndef KLEIDICV_TARGET_NEON #define KLEIDICV_TARGET_NEON 0 @@ -67,7 +69,7 @@ #undef KLEIDICV_ASSUME_128BIT_SVE2 #define KLEIDICV_ASSUME_128BIT_SVE2 0 #define KLEIDICV_LOCALLY_STREAMING __arm_locally_streaming -#define KLEIDICV_STREAMING_COMPATIBLE __arm_streaming_compatible +#define KLEIDICV_STREAMING __arm_streaming #if KLEIDICV_TARGET_SME #define KLEIDICV_TARGET_FN_ATTRS KLEIDICV_ATTR_SECTION(".text.sme") @@ -83,7 +85,7 @@ #else #define KLEIDICV_LOCALLY_STREAMING -#define KLEIDICV_STREAMING_COMPATIBLE +#define KLEIDICV_STREAMING #endif #ifdef __linux__ diff --git a/kleidicv/include/kleidicv/filters/filter_2d_neon.h b/kleidicv/include/kleidicv/filters/filter_2d_neon.h index f700a063d2f419c71c86d08922ea7c5043013cbb..e9132fa09cf01eb23658cff30ccb8098fbe4b867 100644 --- a/kleidicv/include/kleidicv/filters/filter_2d_neon.h +++ b/kleidicv/include/kleidicv/filters/filter_2d_neon.h @@ -94,7 +94,7 @@ class Filter2d { void process_one_pixel_with_horizontal_borders( Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_one_element_with_horizontal_borders( @@ -119,16 +119,15 @@ class Filter2d { void process_one_element_with_horizontal_borders( Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceType src[KSize][KSize]; - auto KernelWindow = [&](size_t row, size_t col) - KLEIDICV_STREAMING_COMPATIBLE -> SourceType& { - return src[row][col]; - }; + auto KernelWindow = + [&](size_t row, size_t col) + KLEIDICV_STREAMING -> SourceType& { return src[row][col]; }; auto load_array_element = [&](const SourceType& x) - KLEIDICV_STREAMING_COMPATIBLE { return x; }; + KLEIDICV_STREAMING { return x; }; WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, window_row_offsets, window_col_offsets, diff --git a/kleidicv/include/kleidicv/filters/filter_2d_sc.h b/kleidicv/include/kleidicv/filters/filter_2d_sc.h index 2783849eb3fc93d6409fb8d26405ab315f3862c7..707a1fc5803a7900fa3bf5b9b33543bdf192a94e 100644 --- a/kleidicv/include/kleidicv/filters/filter_2d_sc.h +++ b/kleidicv/include/kleidicv/filters/filter_2d_sc.h @@ -24,8 +24,7 @@ class Filter2D3x3VectorOperations { static void process_one_element_with_vector_operation( svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index, - const InnerFilterType& filter_) KLEIDICV_STREAMING_COMPATIBLE { + size_t index, const InnerFilterType& filter_) KLEIDICV_STREAMING { SourceVectorType src_0_0, src_0_1, src_0_2, src_1_0, src_1_1, src_1_2, src_2_0, src_2_1, src_2_2, dst_vec; ScalableVectorArray2D KernelWindow = {{ @@ -34,9 +33,8 @@ class Filter2D3x3VectorOperations { {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2)}, }}; - auto load_array_element = - [&](const SourceType& x) - KLEIDICV_STREAMING_COMPATIBLE { return svld1(pg, &x); }; + auto load_array_element = [&](const SourceType& x) + KLEIDICV_STREAMING { return svld1(pg, &x); }; WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, window_row_offsets, window_col_offsets, @@ -51,7 +49,7 @@ class Filter2D3x3VectorOperations { svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, BorderOffsets window_col_offsets, size_t index, - const InnerFilterType& filter_) KLEIDICV_STREAMING_COMPATIBLE { + const InnerFilterType& filter_) KLEIDICV_STREAMING { SourceVectorType src_0_0, src_0_1, src_0_2, src_1_0, src_1_1, src_1_2, src_2_0, src_2_1, src_2_2, src_3_0, src_3_1, src_3_2, dst_vec_0, dst_vec_1; @@ -63,9 +61,8 @@ class Filter2D3x3VectorOperations { {std::ref(src_3_0), std::ref(src_3_1), std::ref(src_3_2)}, }}; - auto load_array_element = - [&](const SourceType& x) - KLEIDICV_STREAMING_COMPATIBLE { return svld1(pg, &x); }; + auto load_array_element = [&](const SourceType& x) + KLEIDICV_STREAMING { return svld1(pg, &x); }; WindowLoaderType::load_window_to_handle_dual_rows( KernelWindow, load_array_element, src_rows, window_row_offsets_0, @@ -90,8 +87,7 @@ class Filter2D5x5VectorOperations { static void process_one_element_with_vector_operation( svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index, - const InnerFilterType& filter_) KLEIDICV_STREAMING_COMPATIBLE { + size_t index, const InnerFilterType& filter_) KLEIDICV_STREAMING { SourceVectorType src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_2_0, src_2_1, src_2_2, src_2_3, src_2_4, src_3_0, src_3_1, src_3_2, src_3_3, src_3_4, src_4_0, src_4_1, @@ -111,9 +107,8 @@ class Filter2D5x5VectorOperations { std::ref(src_4_3), std::ref(src_4_4)}, }}; - auto load_array_element = - [&](const SourceType& x) - KLEIDICV_STREAMING_COMPATIBLE { return svld1(pg, &x); }; + auto load_array_element = [&](const SourceType& x) + KLEIDICV_STREAMING { return svld1(pg, &x); }; WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, window_row_offsets, window_col_offsets, @@ -134,8 +129,7 @@ class Filter2D7x7VectorOperations { static void process_one_element_with_vector_operation( svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index, - const InnerFilterType& filter_) KLEIDICV_STREAMING_COMPATIBLE { + size_t index, const InnerFilterType& filter_) KLEIDICV_STREAMING { SourceVectorType src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, src_2_0, src_2_1, src_2_2, src_2_3, src_2_4, src_2_5, src_2_6, src_3_0, @@ -169,9 +163,8 @@ class Filter2D7x7VectorOperations { std::ref(src_6_6)}, }}; - auto load_array_element = - [&](const SourceType& x) - KLEIDICV_STREAMING_COMPATIBLE { return svld1(pg, &x); }; + auto load_array_element = [&](const SourceType& x) + KLEIDICV_STREAMING { return svld1(pg, &x); }; WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, window_row_offsets, window_col_offsets, @@ -196,16 +189,16 @@ class Filter2d { using BorderOffsets = typename BorderInfoType::Offsets; // using Base = VectorOperationProviderType; static constexpr size_t kMargin = KSize / 2UL; - explicit Filter2d(InnerFilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit Filter2d(InnerFilterType filter) KLEIDICV_STREAMING : filter_{filter} {} void process_pixels_without_horizontal_borders( size_t width, Rows src_rows, Rows dst_rows, BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg = SourceVecTraits::svptrue(); VectorOperationProviderType:: template process_one_element_with_vector_operation src_rows, Rows dst_rows, BorderOffsets window_row_offsets, - BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { VectorOperationProviderType:: template process_one_element_with_vector_operation src_rows, Rows dst_rows, BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, - BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg = SourceVecTraits::svptrue(); VectorOperationProviderType:: template process_two_elements_with_vector_operation src_rows, Rows dst_rows, BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, - BorderOffsets window_col_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { VectorOperationProviderType:: template process_two_elements_with_vector_operation src_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index) KLEIDICV_STREAMING_COMPATIBLE { + size_t index) KLEIDICV_STREAMING { KernelWindow(0, 0) = load_array_element( src_rows.at(window_row_offsets.c0(), window_col_offsets.c0())[index]); KernelWindow(0, 1) = load_array_element( @@ -51,7 +51,7 @@ class Filter2dWindowLoader3x3 { LoadArrayElementFunctionType load_array_element, Rows src_rows, BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, BorderOffsets window_col_offsets, - size_t index) KLEIDICV_STREAMING_COMPATIBLE { + size_t index) KLEIDICV_STREAMING { load_window(KernelWindow, load_array_element, src_rows, window_row_offsets_0, window_col_offsets, index); diff --git a/kleidicv/include/kleidicv/filters/filter_2d_window_loader_5x5.h b/kleidicv/include/kleidicv/filters/filter_2d_window_loader_5x5.h index 94820a0c65d70b4594b4c89fabbec9baa584ef00..56aa5d2f4cc9b9e8947667301c338556da0c84f8 100644 --- a/kleidicv/include/kleidicv/filters/filter_2d_window_loader_5x5.h +++ b/kleidicv/include/kleidicv/filters/filter_2d_window_loader_5x5.h @@ -21,7 +21,7 @@ class Filter2dWindowLoader5x5 { Rows src_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index) KLEIDICV_STREAMING_COMPATIBLE { + size_t index) KLEIDICV_STREAMING { KernelWindow(0, 0) = load_array_element( src_rows.at(window_row_offsets.c0(), window_col_offsets.c0())[index]); KernelWindow(0, 1) = load_array_element( diff --git a/kleidicv/include/kleidicv/filters/filter_2d_window_loader_7x7.h b/kleidicv/include/kleidicv/filters/filter_2d_window_loader_7x7.h index f232f8fbdb32de9516169a0692369f9311b71568..dfd5a8ceac2ee7e73b560a11c2c0b29d0afa6d69 100644 --- a/kleidicv/include/kleidicv/filters/filter_2d_window_loader_7x7.h +++ b/kleidicv/include/kleidicv/filters/filter_2d_window_loader_7x7.h @@ -21,7 +21,7 @@ class Filter2dWindowLoader7x7 { Rows src_rows, BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, - size_t index) KLEIDICV_STREAMING_COMPATIBLE { + size_t index) KLEIDICV_STREAMING { // first row KernelWindow(0, 0) = load_array_element( src_rows.at(window_row_offsets.c0(), window_col_offsets.c0())[index]); diff --git a/kleidicv/include/kleidicv/filters/gaussian_blur.h b/kleidicv/include/kleidicv/filters/gaussian_blur.h index e801fd8d1f4fa49765e87ebd71044843d4ec5628..5c0bbc2c48eb5b21a57079755d495dda7b066c27 100644 --- a/kleidicv/include/kleidicv/filters/gaussian_blur.h +++ b/kleidicv/include/kleidicv/filters/gaussian_blur.h @@ -90,7 +90,7 @@ kleidicv_error_t gaussian_blur_checks( const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, size_t height, size_t channels, const KLEIDICV_TARGET_NAMESPACE::SeparableFilterWorkspace *workspace) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { CHECK_POINTERS(workspace); CHECK_POINTER_AND_STRIDE(src, src_stride, height); diff --git a/kleidicv/include/kleidicv/filters/process_fitler_2d.h b/kleidicv/include/kleidicv/filters/process_fitler_2d.h index 0c8f1a76672d37aab669bc872f047867509ae837..2d44364f199a4c35029ad63ae8cac731d9b1a8ef 100644 --- a/kleidicv/include/kleidicv/filters/process_fitler_2d.h +++ b/kleidicv/include/kleidicv/filters/process_fitler_2d.h @@ -20,7 +20,7 @@ void process_filter2d(Rectangle rect, size_t y_begin, size_t y_end, Rows src_rows, Rows dst_rows, typename FilterType::BorderType border_type, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter) KLEIDICV_STREAMING { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; @@ -73,7 +73,7 @@ void process_filter2d_by_dual_rows( Rows src_rows, Rows dst_rows, typename FilterType::BorderType border_type, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter) KLEIDICV_STREAMING { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index 1ff29d18a318ac43b285a63865ebb5e0ceb3c78e..d0306cc6e2a801a7f4c9c0d763a11dd30112f094 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -33,56 +33,54 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING : filter_{filter} {} static constexpr size_t margin = 7UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_horizontal_border(src_rows, dst_rows, border_offsets, index); @@ -93,7 +91,7 @@ class SeparableFilter { void vertical_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -131,10 +129,10 @@ class SeparableFilter { filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -197,8 +195,8 @@ class SeparableFilter { void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -235,10 +233,10 @@ class SeparableFilter { filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal_border(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferType src[15]; src[0] = src_rows.at(0, border_offsets.c0())[index]; src[1] = src_rows.at(0, border_offsets.c1())[index]; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h index 73708ef70a30634a3612e2245f47839c6a5c8d02..f705e9ba1484512f99d6450a83ffe2b666673663 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h @@ -33,56 +33,54 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING : filter_{filter} {} static constexpr size_t margin = 10UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_horizontal_border(src_rows, dst_rows, border_offsets, index); @@ -93,7 +91,7 @@ class SeparableFilter { void vertical_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -143,10 +141,10 @@ class SeparableFilter { filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -227,8 +225,8 @@ class SeparableFilter { void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -279,10 +277,10 @@ class SeparableFilter { filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal_border(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferType src[21]; src[0] = src_rows.at(0, border_offsets.c0())[index]; src[1] = src_rows.at(0, border_offsets.c1())[index]; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index 31005a941df802f04e4ae51b0cac31656d615063..c7d247430a53747134ca4404c0fa6644a1560c28 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -33,56 +33,54 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING : filter_{filter} {} static constexpr size_t margin = 1UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_horizontal_border(src_rows, dst_rows, border_offsets, index); @@ -93,7 +91,7 @@ class SeparableFilter { void vertical_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -104,10 +102,10 @@ class SeparableFilter { filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -130,8 +128,8 @@ class SeparableFilter { void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -143,10 +141,10 @@ class SeparableFilter { filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal_border(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferType src[3]; src[0] = src_rows.at(0, border_offsets.c0())[index]; src[1] = src_rows.at(0, border_offsets.c1())[index]; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h index ff0b719eef1a257d30a9c22ceb66467ad4e71e22..ae5352f7060bc3de1c9e32099af1545299350ec0 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h @@ -33,56 +33,54 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING : filter_{filter} {} static constexpr size_t margin = 2UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_horizontal_border(src_rows, dst_rows, border_offsets, index); @@ -93,7 +91,7 @@ class SeparableFilter { void vertical_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -109,10 +107,10 @@ class SeparableFilter { filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -140,8 +138,8 @@ class SeparableFilter { void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -157,10 +155,10 @@ class SeparableFilter { filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal_border(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferType src[5]; src[0] = src_rows.at(0, border_offsets.c0())[index]; src[1] = src_rows.at(0, border_offsets.c1())[index]; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h index 05fb376a9e6d608ec9402545b60c98c01d5fc214..cfcd1b320c0c0de2c3fdec385c4c51d23cc87750 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h @@ -33,56 +33,54 @@ class SeparableFilter { using BorderType = FixedBorderType; using BorderOffsets = typename BorderInfoType::Offsets; - explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING_COMPATIBLE + explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING : filter_{filter} {} static constexpr size_t margin = 3UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining( - [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); - }); + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + }); } // Processing of horizontal borders is always scalar because border offsets // change for each and every element in the border. void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (size_t index = 0; index < src_rows.channels(); ++index) { disable_loop_vectorization(); process_horizontal_border(src_rows, dst_rows, border_offsets, index); @@ -93,7 +91,7 @@ class SeparableFilter { void vertical_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + size_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -113,10 +111,10 @@ class SeparableFilter { filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } - void horizontal_vector_path_2x( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { auto src_0 = &src_rows.at(0, border_offsets.c0())[index]; auto src_1 = &src_rows.at(0, border_offsets.c1())[index]; auto src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -150,8 +148,8 @@ class SeparableFilter { void horizontal_vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, size_t index) const - KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -172,10 +170,10 @@ class SeparableFilter { filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } - void process_horizontal_border( - Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - size_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal_border(Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + size_t index) const KLEIDICV_STREAMING { BufferType src[7]; src[0] = src_rows.at(0, border_offsets.c0())[index]; src[1] = src_rows.at(0, border_offsets.c1())[index]; diff --git a/kleidicv/include/kleidicv/filters/sigma.h b/kleidicv/include/kleidicv/filters/sigma.h index 87ae9d2e83cc2d979f03f606952d757f40c9d2cb..686c7197191feca8d440be33f78009c4c6b27b1e 100644 --- a/kleidicv/include/kleidicv/filters/sigma.h +++ b/kleidicv/include/kleidicv/filters/sigma.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -14,7 +14,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { static constexpr size_t get_half_kernel_size(size_t kernel_size) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { // since kernel sizes are odd, "half" here means that // the extra element is included return (kernel_size >> 1) + 1; diff --git a/kleidicv/include/kleidicv/morphology/workspace.h b/kleidicv/include/kleidicv/morphology/workspace.h index 47682842faf0e63d8aa47099105f3ef28382c8a1..8f072ed532850cbf9d3ebf845e2584c961ac13db 100644 --- a/kleidicv/include/kleidicv/morphology/workspace.h +++ b/kleidicv/include/kleidicv/morphology/workspace.h @@ -26,8 +26,7 @@ class MorphologyWorkspace; // Deleter for MorphologyWorkspace instances. class MorphologyWorkspaceDeleter { public: - void operator()(MorphologyWorkspace *ptr) const - KLEIDICV_STREAMING_COMPATIBLE { + void operator()(MorphologyWorkspace *ptr) const KLEIDICV_STREAMING { std::free(ptr); }; }; @@ -45,7 +44,7 @@ class MorphologyWorkspace final { }; static std::optional get_border_type( - kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_border_type_t border_type) KLEIDICV_STREAMING { switch (border_type) { case KLEIDICV_BORDER_TYPE_REPLICATE: return BorderType::REPLICATE; @@ -60,8 +59,7 @@ class MorphologyWorkspace final { class CopyDataMemcpy { public: constexpr void operator()(Rows src_rows, Rows dst_rows, - size_t length) const - KLEIDICV_STREAMING_COMPATIBLE { + size_t length) const KLEIDICV_STREAMING { #if KLEIDICV_TARGET_SME __arm_sc_memcpy(static_cast(&dst_rows[0]), static_cast(&src_rows[0]), @@ -82,7 +80,7 @@ class MorphologyWorkspace final { Pointer &workspace, kleidicv_rectangle_t kernel, kleidicv_point_t anchor, BorderType border_type, const uint8_t *border_value, size_t channels, size_t iterations, size_t type_size, - kleidicv_rectangle_t image) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_rectangle_t image) KLEIDICV_STREAMING { // These values are arbitrarily choosen. const size_t rows_per_iteration = std::max(2 * kernel.height, 32UL); // To avoid load/store penalties. @@ -180,8 +178,7 @@ class MorphologyWorkspace final { template void process(Rectangle rect, Rows src_rows, Rows dst_rows, Margin margin, - BorderType border_type, - O operation) KLEIDICV_STREAMING_COMPATIBLE { + BorderType border_type, O operation) KLEIDICV_STREAMING { using S = typename O::SourceType; using B = typename O::BufferType; typename O::CopyData copy_data{}; @@ -325,16 +322,14 @@ class MorphologyWorkspace final { private: // The number of wide rows to process in the next iteration. - [[nodiscard]] size_t get_next_horizontal_height() - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] size_t get_next_horizontal_height() KLEIDICV_STREAMING { size_t height = std::min(horizontal_height_, rows_per_iteration_); horizontal_height_ -= height; return height; } // The number of indirect rows to process in the next iteration. - [[nodiscard]] size_t get_next_vertical_height() - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] size_t get_next_vertical_height() KLEIDICV_STREAMING { size_t height = std::min(vertical_height_, rows_per_iteration_); vertical_height_ -= height; return height; @@ -342,7 +337,7 @@ class MorphologyWorkspace final { template void make_constant_border(Rows dst_rows, size_t dst_index, - size_t count) KLEIDICV_STREAMING_COMPATIBLE { + size_t count) KLEIDICV_STREAMING { auto dst = &dst_rows.at(0, dst_index)[0]; for (size_t index = 0; index < count; ++index) { for (size_t channel = 0; channel < dst_rows.channels(); ++channel) { @@ -354,7 +349,7 @@ class MorphologyWorkspace final { template void replicate_border(Rows src_rows, Rows dst_rows, size_t src_index, size_t dst_index, - size_t count) KLEIDICV_STREAMING_COMPATIBLE { + size_t count) KLEIDICV_STREAMING { if (!count) { return; } diff --git a/kleidicv/include/kleidicv/operations.h b/kleidicv/include/kleidicv/operations.h index d6c736c5ed30975d5d50e0a6a09a27c3d6fc247e..2f17195bc5aa1f27f5772b8229ca37eff4e4513e 100644 --- a/kleidicv/include/kleidicv/operations.h +++ b/kleidicv/include/kleidicv/operations.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -31,108 +31,101 @@ class OperationBase { using ContextType = context_type_t; // Returns a reference to the inner operation. - OperationType &operation() KLEIDICV_STREAMING_COMPATIBLE { - return operation_; - } + OperationType &operation() KLEIDICV_STREAMING { return operation_; } // Forwards num_lanes() calls to the inner operation. - static size_t num_lanes() KLEIDICV_STREAMING_COMPATIBLE { + static size_t num_lanes() KLEIDICV_STREAMING { return VecTraits::num_lanes(); } // Forwards vector_path_2x() calls to the inner operation. template - decltype(auto) vector_path_2x(ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING { return operation().vector_path_2x(std::forward(args)...); } // Forwards vector_path() calls to the inner operation. template - decltype(auto) vector_path(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) vector_path(ArgTypes &&...args) KLEIDICV_STREAMING { return operation().vector_path(std::forward(args)...); } // Forwards remaining_path() calls to the inner operation. template - decltype(auto) remaining_path(ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) remaining_path(ArgTypes &&...args) KLEIDICV_STREAMING { return operation().remaining_path(std::forward(args)...); } // Forwards tail_path() calls to the inner operation. template - decltype(auto) tail_path(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) tail_path(ArgTypes &&...args) KLEIDICV_STREAMING { return operation().tail_path(std::forward(args)...); } // Forwards scalar_path() calls to the inner operation. template - decltype(auto) scalar_path(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) scalar_path(ArgTypes &&...args) KLEIDICV_STREAMING { return operation().scalar_path(std::forward(args)...); } template - decltype(auto) load(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) load(ArgTypes &&...args) KLEIDICV_STREAMING { return VecTraits::load(std::forward(args)...); } template - decltype(auto) load_consecutive(ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) load_consecutive(ArgTypes &&...args) KLEIDICV_STREAMING { return VecTraits::load_consecutive(std::forward(args)...); } template - decltype(auto) store(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) store(ArgTypes &&...args) KLEIDICV_STREAMING { return VecTraits::store(std::forward(args)...); } template - decltype(auto) store_consecutive(ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) store_consecutive(ArgTypes &&...args) KLEIDICV_STREAMING { return VecTraits::store_consecutive(std::forward(args)...); } // Forwards max_vectors_per_block() calls to the inner operation. - size_t max_vectors_per_block() KLEIDICV_STREAMING_COMPATIBLE { + size_t max_vectors_per_block() KLEIDICV_STREAMING { return operation_.max_vectors_per_block(); } // Forwards on_block_finished() calls to the inner operation. - void on_block_finished(size_t vectors_in_block) - KLEIDICV_STREAMING_COMPATIBLE { + void on_block_finished(size_t vectors_in_block) KLEIDICV_STREAMING { return operation_.on_block_finished(vectors_in_block); } // Returns true if the innermost operation is unrolled twice, otherwise false. - static constexpr bool is_unrolled_twice() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr bool is_unrolled_twice() KLEIDICV_STREAMING { return ::KLEIDICV_TARGET_NAMESPACE::is_unrolled_twice< concrete_operation_type_t>; } // Returns true if the innermost operation is unrolled once, otherwise false. - static constexpr bool is_unrolled_once() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr bool is_unrolled_once() KLEIDICV_STREAMING { return ::KLEIDICV_TARGET_NAMESPACE::is_unrolled_once< concrete_operation_type_t>; } // Returns true if the innermost operation uses tail path, otherwise false. - static constexpr bool uses_tail_path() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr bool uses_tail_path() KLEIDICV_STREAMING { return ::KLEIDICV_TARGET_NAMESPACE::uses_tail_path< concrete_operation_type_t>; } // Returns true if the innermost operation tries to avoid tail loop, otherwise // false. - static constexpr bool try_to_avoid_tail_loop() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr bool try_to_avoid_tail_loop() KLEIDICV_STREAMING { return ::KLEIDICV_TARGET_NAMESPACE::try_to_avoid_tail_loop< concrete_operation_type_t>; } protected: // Constructor is protected so that only derived classes can instantiate. - explicit OperationBase(OperationType &operation) KLEIDICV_STREAMING_COMPATIBLE + explicit OperationBase(OperationType &operation) KLEIDICV_STREAMING : operation_{operation} {} private: @@ -144,8 +137,8 @@ class OperationBase { template class ForwardingOperation : public OperationBase { public: - explicit ForwardingOperation(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit ForwardingOperation(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} }; // end of class ForwardingOperation // Facade to offer a simplified row-based operation interface. @@ -160,19 +153,18 @@ class ForwardingOperation : public OperationBase { template class RowBasedOperation : public OperationBase { public: - explicit RowBasedOperation(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit RowBasedOperation(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} // NOLINTBEGIN(cppcoreguidelines-avoid-goto, hicpp-avoid-goto) template - void process_row(size_t length, - ColumnTypes... columns) KLEIDICV_STREAMING_COMPATIBLE { + void process_row(size_t length, ColumnTypes... columns) KLEIDICV_STREAMING { LoopUnroll loop{length, this->num_lanes()}; // clang-format off loop.unroll_twice_if( - [&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t step) KLEIDICV_STREAMING { this->operation().vector_path_2x(columns...); ((columns += step), ...); }); @@ -180,7 +172,7 @@ class RowBasedOperation : public OperationBase { avoid_tail_loop: loop.unroll_once_if( - [&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t step) KLEIDICV_STREAMING { this->operation().vector_path(columns...); ((columns += step), ...); }); @@ -189,7 +181,7 @@ class RowBasedOperation : public OperationBase { // possible. if constexpr (OperationType::is_unrolled_once() && OperationType::try_to_avoid_tail_loop()) { if (loop.try_avoid_tail_loop( - [&](size_t backward_step) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t backward_step) KLEIDICV_STREAMING { // Adjust pointers backwards to include // the leftover bytes. ((columns -= backward_step), ...); @@ -199,7 +191,7 @@ class RowBasedOperation : public OperationBase { } loop.remaining( - [&](size_t length, size_t /* step */) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t length, size_t /* step */) KLEIDICV_STREAMING { this->operation().remaining_path(length, columns...); }); @@ -223,21 +215,20 @@ class RowBasedOperation : public OperationBase { template class RowBasedBlockOperation : public OperationBase { public: - explicit RowBasedBlockOperation(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit RowBasedBlockOperation(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} template - void process_row(size_t length, - ColumnTypes... columns) KLEIDICV_STREAMING_COMPATIBLE { + void process_row(size_t length, ColumnTypes... columns) KLEIDICV_STREAMING { if constexpr (OperationType::is_unrolled_twice()) { - process_blocks<2>(length, [&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + process_blocks<2>(length, [&](size_t step) KLEIDICV_STREAMING { this->operation().vector_path_2x(columns...); ((columns += step), ...); }); } if constexpr (OperationType::is_unrolled_once()) { - process_blocks<1>(length, [&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + process_blocks<1>(length, [&](size_t step) KLEIDICV_STREAMING { this->operation().vector_path(columns...); ((columns += step), ...); }); @@ -246,7 +237,7 @@ class RowBasedBlockOperation : public OperationBase { // clang-format off LoopUnroll loop{length, this->num_lanes()}; loop.remaining( - [&](size_t length, size_t /* step */) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t length, size_t /* step */) KLEIDICV_STREAMING { this->operation().remaining_path(length, columns...); }); // clang-format on @@ -255,7 +246,7 @@ class RowBasedBlockOperation : public OperationBase { private: template void process_blocks(size_t &length, - CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + CallbackType callback) KLEIDICV_STREAMING { // The number of elements a single iteration would process. const size_t elements_per_iteration = UnrollFactor * this->num_lanes(); // The number of elements which will be processed when this method returns. @@ -280,7 +271,7 @@ class RowBasedBlockOperation : public OperationBase { // Process data with the appropriate unroll factor. LoopUnroll loop{block_length, this->num_lanes()}; loop.unroll_n_times( - [&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + [&](size_t step) KLEIDICV_STREAMING { callback(step); // Adjust remaining length here. // This improves generated code. @@ -305,13 +296,13 @@ class ParallelRowsAdapter : public OperationBase { using ParallelColumnType = ParallelColumns; using ConstColumnType = Columns; - explicit ParallelRowsAdapter(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit ParallelRowsAdapter(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} // Forwards vector_path_2x() calls to the inner operation with one source and // destination parallel columns. void vector_path_2x(ConstParallelColumnType src_a, ConstColumnType src_b, - ParallelColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ParallelColumnType dst) KLEIDICV_STREAMING { this->operation().vector_path_2x(src_a.first(), src_a.second(), src_b, dst.first(), dst.second()); } @@ -319,7 +310,7 @@ class ParallelRowsAdapter : public OperationBase { // Forwards vector_path() calls to the inner operation with one source and // destination parallel columns. void vector_path(ConstParallelColumnType src_a, ConstColumnType src_b, - ParallelColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ParallelColumnType dst) KLEIDICV_STREAMING { this->operation().vector_path(src_a.first(), src_a.second(), src_b, dst.first(), dst.second()); } @@ -328,7 +319,7 @@ class ParallelRowsAdapter : public OperationBase { // destination parallel columns. void remaining_path(size_t length, ConstParallelColumnType src_a, ConstColumnType src_b, - ParallelColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ParallelColumnType dst) KLEIDICV_STREAMING { this->operation().remaining_path(length, src_a.first(), src_a.second(), src_b, dst.first(), dst.second()); } @@ -367,8 +358,8 @@ class OperationAdapter : public OperationBase { using ConstColumnType = Columns; using ColumnType = Columns; - explicit OperationAdapter(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit OperationAdapter(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} // --------------------------------------------------------------------------- // Forwarding implementations for vector_path_2x(). @@ -378,7 +369,7 @@ class OperationAdapter : public OperationBase { // void T::vector_path([ContextType,] VectorType); template enable_if_has_vector_path_t vector_path_2x( - ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING_COMPATIBLE { + ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING { VectorType src_0, src_1; operation().load_consecutive(ctx, &src[0], src_0, src_1); operation().vector_path(ctx, src_0); @@ -390,7 +381,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_0, src_1; operation().load_consecutive(ctx, &src[0], src_0, src_1); VectorType res_0 = operation().vector_path(ctx, src_0); @@ -404,7 +395,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_a_1, src_b_0, src_b_1; operation().load_consecutive(ctx, &src_a[0], src_a_0, src_a_1); operation().load_consecutive(ctx, &src_b[0], src_b_0, src_b_1); @@ -418,7 +409,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_0, src_1; operation().load_consecutive(ctx, &src[0], src_0, src_1); operation().vector_path(ctx, src_0, &dst[0]); @@ -431,7 +422,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_a_1, src_b_0, src_b_1; operation().load_consecutive(ctx, &src_a[0], src_a_0, src_a_1); operation().load_consecutive(ctx, &src_b[0], src_b_0, src_b_1); @@ -446,8 +437,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ConstColumnType src_c, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ConstColumnType src_c, ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_a_1, src_b_0, src_b_1, src_c_0, src_c_1; operation().load_consecutive(ctx, &src_a[0], src_a_0, src_a_1); operation().load_consecutive(ctx, &src_b[0], src_b_0, src_b_1); @@ -466,7 +456,7 @@ class OperationAdapter : public OperationBase { VectorType, ScalarType *, ScalarType *> vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { VectorType src_a_0, src_a_1, src_b_0, src_b_1, src_c_0, src_c_1; operation().load_consecutive(ctx, &src_a[0], src_a_0, src_a_1); operation().load_consecutive(ctx, &src_b[0], src_b_0, src_b_1); @@ -487,7 +477,7 @@ class OperationAdapter : public OperationBase { VectorType, VectorType, ScalarType *> vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ConstColumnType src_d, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_a_1, src_b_0, src_b_1; VectorType src_c_0, src_c_1, src_d_0, src_d_1; operation().load_consecutive(ctx, &src_a[0], src_a_0, src_a_1); @@ -505,7 +495,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst[0]); operation().vector_path(ctx, &src.at(num_lanes())[0], &dst.at(num_lanes())[0]); @@ -517,7 +507,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().vector_path(ctx, &src_a[0], &src_b[0], &dst[0]); operation().vector_path(ctx, &src_a.at(num_lanes())[0], &src_b.at(num_lanes())[0], &dst.at(num_lanes())[0]); @@ -532,7 +522,7 @@ class OperationAdapter : public OperationBase { ScalarType *, ScalarType *> vector_path_2x(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().vector_path(ctx, &src_a[0], &src_b[0], &src_c[0], &dst_a[0], &dst_b[0]); operation().vector_path( @@ -548,7 +538,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0]); operation().vector_path(ctx, &src.at(num_lanes())[0], &dst_a.at(num_lanes())[0], @@ -562,7 +552,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { Vector2Type vdst_a, vdst_b; operation().vector_path(ctx, &src[0], vdst_a.val[0], vdst_b.val[0]); @@ -580,7 +570,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { Vector2Type vsrc_0, vsrc_1; Vector2Type vdst_a, vdst_b; @@ -600,8 +590,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0]); operation().vector_path( ctx, &src.at(num_lanes() * 3)[0], &dst_a.at(num_lanes())[0], @@ -615,8 +604,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { Vector2Type vdst_a, vdst_b, vdst_c; operation().vector_path(ctx, &src[0], vdst_a.val[0], vdst_b.val[0], @@ -636,8 +624,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { Vector3Type vsrc_0, vsrc_1; Vector2Type vdst_a, vdst_b, vdst_c; @@ -662,7 +649,7 @@ class OperationAdapter : public OperationBase { ScalarType *> vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0], &dst_d[0]); operation().vector_path( @@ -683,7 +670,7 @@ class OperationAdapter : public OperationBase { VectorType &> vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { Vector2Type vdst_a, vdst_b, vdst_c, vdst_d; operation().vector_path(ctx, &src[0], vdst_a.val[0], vdst_b.val[0], @@ -711,7 +698,7 @@ class OperationAdapter : public OperationBase { VectorType &> vector_path_2x(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { Vector4Type vsrc_0, vsrc_1; Vector2Type vdst_a, vdst_b, vdst_c, vdst_d; @@ -737,7 +724,7 @@ class OperationAdapter : public OperationBase { // void T::vector_path([ContextType,] VectorType); template enable_if_has_vector_path_t vector_path( - ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING_COMPATIBLE { + ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING { VectorType src_0; operation().load(ctx, &src[0], src_0); operation().vector_path(ctx, src_0); @@ -748,7 +735,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_0; operation().load(ctx, &src[0], src_0); VectorType res_0 = operation().vector_path(ctx, src_0); @@ -761,7 +748,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_b_0; operation().load(ctx, &src_a[0], src_a_0); operation().load(ctx, &src_b[0], src_b_0); @@ -774,7 +761,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_0; operation().load(ctx, &src[0], src_0); operation().vector_path(ctx, src_0, &dst[0]); @@ -786,7 +773,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_b_0; operation().load(ctx, &src_a[0], src_a_0); operation().load(ctx, &src_b[0], src_b_0); @@ -800,8 +787,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ConstColumnType src_c, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ConstColumnType src_c, ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_b_0, src_c_0; operation().load(ctx, &src_a[0], src_a_0); operation().load(ctx, &src_b[0], src_b_0); @@ -817,7 +803,7 @@ class OperationAdapter : public OperationBase { VectorType, ScalarType *, ScalarType *> vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { VectorType src_a_0, src_b_0, src_c_0; operation().load(ctx, &src_a[0], src_a_0); operation().load(ctx, &src_b[0], src_b_0); @@ -834,7 +820,7 @@ class OperationAdapter : public OperationBase { VectorType, VectorType, ScalarType *> vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ConstColumnType src_d, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_a_0, src_b_0, src_c_0, src_d_0; operation().load(ctx, &src_a[0], src_a_0); operation().load(ctx, &src_b[0], src_b_0); @@ -849,7 +835,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst[0]); } @@ -860,7 +846,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().vector_path(ctx, &src_a[0], &src_b[0], &dst[0]); } @@ -873,7 +859,7 @@ class OperationAdapter : public OperationBase { ScalarType *, ScalarType *> vector_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().vector_path(ctx, &src_a[0], &src_b[0], &src_c[0], &dst_a[0], &dst_b[0]); } @@ -885,7 +871,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0]); } @@ -896,7 +882,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b; operation().vector_path(ctx, &src[0], vdst_a, vdst_b); @@ -912,7 +898,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b; Vector2Type vsrc; @@ -931,8 +917,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0]); } @@ -943,8 +928,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b, vdst_c; operation().vector_path(ctx, &src[0], vdst_a, vdst_b, vdst_c); @@ -961,8 +945,7 @@ class OperationAdapter : public OperationBase { enable_if_has_vector_path_t vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b, vdst_c; Vector3Type vsrc; @@ -985,7 +968,7 @@ class OperationAdapter : public OperationBase { ScalarType *> vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { operation().vector_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0], &dst_d[0]); } @@ -1001,7 +984,7 @@ class OperationAdapter : public OperationBase { VectorType &> vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b, vdst_c, vdst_d; operation().vector_path(ctx, &src[0], vdst_a, vdst_b, vdst_c, vdst_d); @@ -1023,7 +1006,7 @@ class OperationAdapter : public OperationBase { VectorType &> vector_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { VectorType vdst_a, vdst_b, vdst_c, vdst_d; Vector4Type vsrc; @@ -1046,7 +1029,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_tail_path_t tail_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { VectorType src_0; operation().load(ctx, &src[0], src_0); operation().tail_path(ctx, src_0, &dst[0]); @@ -1058,7 +1041,7 @@ class OperationAdapter : public OperationBase { enable_if_has_tail_path_t tail_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().tail_path(ctx, &src[0], &dst[0]); } @@ -1070,7 +1053,7 @@ class OperationAdapter : public OperationBase { // void T::scalar_path([ContextType,] ScalarType); template enable_if_has_scalar_path_t scalar_path( - ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING_COMPATIBLE { + ContextType ctx, ConstColumnType src) KLEIDICV_STREAMING { operation().scalar_path(ctx, src[0]); } @@ -1079,7 +1062,7 @@ class OperationAdapter : public OperationBase { template enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { dst[0] = operation().scalar_path(ctx, src[0]); } @@ -1089,7 +1072,7 @@ class OperationAdapter : public OperationBase { enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { dst[0] = operation().scalar_path(ctx, src_a[0], src_b[0]); } @@ -1099,7 +1082,7 @@ class OperationAdapter : public OperationBase { enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src[0], &dst[0]); } @@ -1110,7 +1093,7 @@ class OperationAdapter : public OperationBase { enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src_a[0], &src_b[0], &dst[0]); } @@ -1122,8 +1105,7 @@ class OperationAdapter : public OperationBase { const ScalarType *, const ScalarType *, ScalarType *> scalar_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, - ConstColumnType src_c, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ConstColumnType src_c, ColumnType dst) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src_a[0], &src_b[0], &src_c[0], &dst[0]); } @@ -1136,7 +1118,7 @@ class OperationAdapter : public OperationBase { const ScalarType *, ScalarType *> scalar_path(ContextType ctx, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ConstColumnType src_d, - ColumnType dst) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src_a[0], &src_b[0], &src_c[0], &src_d[0], &dst[0]); } @@ -1151,7 +1133,7 @@ class OperationAdapter : public OperationBase { ScalarType *, ScalarType *> scalar_path(ContextType ctx, size_t length, ConstColumnType src_a, ConstColumnType src_b, ConstColumnType src_c, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().scalar_path(ctx, length, &src_a[0], &src_b[0], &src_c[0], &dst_a[0], &dst_b[0]); } @@ -1163,7 +1145,7 @@ class OperationAdapter : public OperationBase { enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src[0], &dst_a[0], &dst_b[0]); } @@ -1174,8 +1156,7 @@ class OperationAdapter : public OperationBase { enable_if_has_scalar_path_t scalar_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, - ColumnType dst_b, - ColumnType dst_c) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_b, ColumnType dst_c) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0]); } @@ -1188,7 +1169,7 @@ class OperationAdapter : public OperationBase { ScalarType *> scalar_path(ContextType ctx, ConstColumnType src, ColumnType dst_a, ColumnType dst_b, ColumnType dst_c, - ColumnType dst_d) KLEIDICV_STREAMING_COMPATIBLE { + ColumnType dst_d) KLEIDICV_STREAMING { operation().scalar_path(ctx, &src[0], &dst_a[0], &dst_b[0], &dst_c[0], &dst_d[0]); } @@ -1200,57 +1181,54 @@ class RemoveContextAdapter : public OperationBase { public: using ContextType = typename OperationBase::ContextType; - explicit RemoveContextAdapter(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit RemoveContextAdapter(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} template - decltype(auto) load(ContextType, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) load(ContextType, ArgTypes &&...args) KLEIDICV_STREAMING { return OperationBase::load(std::forward(args)...); } template - decltype(auto) load_consecutive(ContextType, ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) load_consecutive(ContextType, + ArgTypes &&...args) KLEIDICV_STREAMING { return OperationBase::load_consecutive( std::forward(args)...); } template - decltype(auto) store(ContextType, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) store(ContextType, ArgTypes &&...args) KLEIDICV_STREAMING { return OperationBase::store(std::forward(args)...); } template - decltype(auto) store_consecutive(ContextType, ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) store_consecutive(ContextType, + ArgTypes &&...args) KLEIDICV_STREAMING { return OperationBase::store_consecutive( std::forward(args)...); } template decltype(auto) vector_path(ContextType, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + ArgTypes &&...args) KLEIDICV_STREAMING { return this->operation().vector_path(std::forward(args)...); } // Forwards remaining_path() calls to the inner operation. template - decltype(auto) remaining_path(ContextType, ArgTypes &&...args) - KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) remaining_path(ContextType, + ArgTypes &&...args) KLEIDICV_STREAMING { return this->operation().remaining_path(std::forward(args)...); } template - decltype(auto) tail_path(ContextType, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + decltype(auto) tail_path(ContextType, ArgTypes &&...args) KLEIDICV_STREAMING { return this->operation().tail_path(std::forward(args)...); } template decltype(auto) scalar_path(ContextType, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + ArgTypes &&...args) KLEIDICV_STREAMING { return this->operation().scalar_path(std::forward(args)...); } }; // end of class RemoveContextAdapter diff --git a/kleidicv/include/kleidicv/sve2.h b/kleidicv/include/kleidicv/sve2.h index e3fe8c69f28c470b89fcb55494937b14ecd40e54..8dd9d56e166771ab513235a23de424bcd55dc9c6 100644 --- a/kleidicv/include/kleidicv/sve2.h +++ b/kleidicv/include/kleidicv/sve2.h @@ -18,13 +18,13 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Context associated with SVE operations. class Context { public: - explicit Context(svbool_t &pg) KLEIDICV_STREAMING_COMPATIBLE : pg_{pg} {} + explicit Context(svbool_t &pg) KLEIDICV_STREAMING : pg_{pg} {} // Sets the predicate associated with the context to a given predicate. - void set_predicate(svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE { pg_ = pg; } + void set_predicate(svbool_t pg) KLEIDICV_STREAMING { pg_ = pg; } // Returns predicate associated with the context. - svbool_t predicate() const KLEIDICV_STREAMING_COMPATIBLE { return pg_; } + svbool_t predicate() const KLEIDICV_STREAMING { return pg_; } protected: // Hold a reference to an svbool_t because a sizeless type cannot be a member. @@ -140,162 +140,206 @@ template class VecTraitsBase : public VectorTypes { public: using typename VectorTypes::VectorType; + using typename VectorTypes::Vector2Type; // Number of lanes in a vector. - static inline size_t num_lanes() KLEIDICV_STREAMING_COMPATIBLE { + static inline size_t num_lanes() KLEIDICV_STREAMING { return static_cast(svcnt()); } // Maximum number of lanes in a vector. - static constexpr size_t max_num_lanes() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr size_t max_num_lanes() KLEIDICV_STREAMING { return 256 / sizeof(ScalarType); } // Loads a single vector from 'src'. static inline void load(Context ctx, const ScalarType *src, - VectorType &vec) KLEIDICV_STREAMING_COMPATIBLE { + VectorType &vec) KLEIDICV_STREAMING { vec = svld1(ctx.predicate(), &src[0]); } // Loads two consecutive vectors from 'src'. static inline void load_consecutive(Context ctx, const ScalarType *src, - VectorType &vec_0, VectorType &vec_1) - KLEIDICV_STREAMING_COMPATIBLE { + VectorType &vec_0, + VectorType &vec_1) KLEIDICV_STREAMING { +#if KLEIDICV_TARGET_SME2 + // Assuming that ctx contains a full predicate. + (void)ctx; + svcount_t p_counter = svptrue_c(); + Vector2Type v = svld1_x2(p_counter, &src[0]); + vec_0 = svget2(v, 0); + vec_1 = svget2(v, 1); +#else vec_0 = svld1(ctx.predicate(), &src[0]); vec_1 = svld1_vnum(ctx.predicate(), &src[0], 1); +#endif } // Stores a single vector to 'dst'. static inline void store(Context ctx, VectorType vec, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { svst1(ctx.predicate(), &dst[0], vec); } // Stores two consecutive vectors to 'dst'. static inline void store_consecutive(Context ctx, VectorType vec_0, - VectorType vec_1, ScalarType *dst) - KLEIDICV_STREAMING_COMPATIBLE { + VectorType vec_1, + ScalarType *dst) KLEIDICV_STREAMING { +#if KLEIDICV_TARGET_SME2 + // Assuming that ctx contains a full predicate. + (void)ctx; + svcount_t p_counter = svptrue_c(); + Vector2Type v = svcreate2(vec_0, vec_1); + svst1(p_counter, &dst[0], v); +#else svst1(ctx.predicate(), &dst[0], vec_0); svst1_vnum(ctx.predicate(), &dst[0], 1, vec_1); +#endif } template static std::enable_if_t svcnt() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svcntb(); } template static std::enable_if_t svcnt() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svcnth(); } template static std::enable_if_t svcnt() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svcntw(); } template static std::enable_if_t svcnt() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svcntd(); } template static std::enable_if_t svcntp( - svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg) KLEIDICV_STREAMING { return svcntp_b8(pg, pg); } template static std::enable_if_t svcntp( - svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg) KLEIDICV_STREAMING { return svcntp_b16(pg, pg); } template static std::enable_if_t svcntp( - svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg) KLEIDICV_STREAMING { return svcntp_b32(pg, pg); } template static std::enable_if_t svcntp( - svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg) KLEIDICV_STREAMING { return svcntp_b64(pg, pg); } template static std::enable_if_t svptrue() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_b8(); } template static std::enable_if_t svptrue() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_b16(); } template static std::enable_if_t svptrue() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_b32(); } template static std::enable_if_t svptrue() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_b64(); } +#if KLEIDICV_TARGET_SME2 + template + static std::enable_if_t svptrue_c() + KLEIDICV_STREAMING { + return svptrue_c8(); + } + + template + static std::enable_if_t svptrue_c() + KLEIDICV_STREAMING { + return svptrue_c16(); + } + + template + static std::enable_if_t svptrue_c() + KLEIDICV_STREAMING { + return svptrue_c32(); + } + + template + static std::enable_if_t svptrue_c() + KLEIDICV_STREAMING { + return svptrue_c64(); + } +#endif // KLEIDICV_TARGET_SME2 + template static std::enable_if_t svptrue_pat() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_pat_b8(pat); } template static std::enable_if_t svptrue_pat() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_pat_b16(pat); } template static std::enable_if_t svptrue_pat() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_pat_b32(pat); } template static std::enable_if_t svptrue_pat() - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { return svptrue_pat_b64(pat); } template static std::enable_if_t svwhilelt( - IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { + IndexType index, IndexType max_index) KLEIDICV_STREAMING { return svwhilelt_b8(index, max_index); } template static std::enable_if_t svwhilelt( - IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { + IndexType index, IndexType max_index) KLEIDICV_STREAMING { return svwhilelt_b16(index, max_index); } template static std::enable_if_t svwhilelt( - IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { + IndexType index, IndexType max_index) KLEIDICV_STREAMING { return svwhilelt_b32(index, max_index); } template static std::enable_if_t svwhilelt( - IndexType index, IndexType max_index) KLEIDICV_STREAMING_COMPATIBLE { + IndexType index, IndexType max_index) KLEIDICV_STREAMING { return svwhilelt_b64(index, max_index); } @@ -303,8 +347,8 @@ class VecTraitsBase : public VectorTypes { // used for consecutive operations. The input predicate can only have // consecutive ones starting at the lowest element. static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0, - svbool_t &pg_1, svbool_t &pg_2) - KLEIDICV_STREAMING_COMPATIBLE { + svbool_t &pg_1, + svbool_t &pg_2) KLEIDICV_STREAMING { // Length of data. Must be signed because of the unconditional subtraction // of fixed values. int64_t length = 3 * svcntp(pg); @@ -324,9 +368,9 @@ class VecTraitsBase : public VectorTypes { // Transforms a single predicate into four other predicates that then can be // used for consecutive operations. The input predicate can only have // consecutive ones starting at the lowest element. - static void make_consecutive_predicates( - svbool_t pg, svbool_t &pg_0, svbool_t &pg_1, svbool_t &pg_2, - svbool_t &pg_3) KLEIDICV_STREAMING_COMPATIBLE { + static void make_consecutive_predicates(svbool_t pg, svbool_t &pg_0, + svbool_t &pg_1, svbool_t &pg_2, + svbool_t &pg_3) KLEIDICV_STREAMING { // Length of data. Must be signed because of the unconditional subtraction // of fixed values. int64_t length = 4 * svcntp(pg); @@ -356,15 +400,14 @@ class VecTraits : public VecTraitsBase {}; template <> class VecTraits : public VecTraitsBase { public: - static inline svint8_t svdup(int8_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svint8_t svdup(int8_t v) KLEIDICV_STREAMING { return svdup_s8(v); } - static inline svint8_t svreinterpret(svuint8_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svint8_t svreinterpret(svuint8_t v) KLEIDICV_STREAMING { return svreinterpret_s8(v); } static inline svint8_t svasr_n(svbool_t pg, svint8_t v, - uint8_t s) KLEIDICV_STREAMING_COMPATIBLE { + uint8_t s) KLEIDICV_STREAMING { return svasr_n_s8_x(pg, v, s); } }; // end of class VecTraits @@ -372,19 +415,18 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svuint8_t svdup(uint8_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint8_t svdup(uint8_t v) KLEIDICV_STREAMING { return svdup_u8(v); } - static inline svuint8_t svreinterpret(svint8_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint8_t svreinterpret(svint8_t v) KLEIDICV_STREAMING { return svreinterpret_u8(v); } static inline svuint8_t svsub(svbool_t pg, svuint8_t v, - svuint8_t u) KLEIDICV_STREAMING_COMPATIBLE { + svuint8_t u) KLEIDICV_STREAMING { return svsub_u8_x(pg, v, u); } static inline svuint8_t svhsub(svbool_t pg, svuint8_t v, - svuint8_t u) KLEIDICV_STREAMING_COMPATIBLE { + svuint8_t u) KLEIDICV_STREAMING { return svhsub_u8_x(pg, v, u); } }; // end of class VecTraits @@ -392,11 +434,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svint16_t svdup(int16_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svint16_t svdup(int16_t v) KLEIDICV_STREAMING { return svdup_s16(v); } - static inline svint16_t svreinterpret(svuint16_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svint16_t svreinterpret(svuint16_t v) KLEIDICV_STREAMING { return svreinterpret_s16(v); } }; // end of class VecTraits @@ -404,11 +445,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svuint16_t svdup(uint16_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint16_t svdup(uint16_t v) KLEIDICV_STREAMING { return svdup_u16(v); } - static inline svuint16_t svreinterpret(svint16_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint16_t svreinterpret(svint16_t v) KLEIDICV_STREAMING { return svreinterpret_u16(v); } }; // end of class VecTraits @@ -416,11 +456,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svint32_t svdup(int32_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svint32_t svdup(int32_t v) KLEIDICV_STREAMING { return svdup_s32(v); } - static inline svint32_t svreinterpret(svuint32_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svint32_t svreinterpret(svuint32_t v) KLEIDICV_STREAMING { return svreinterpret_s32(v); } }; // end of class VecTraits @@ -428,11 +467,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svuint32_t svdup(uint32_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint32_t svdup(uint32_t v) KLEIDICV_STREAMING { return svdup_u32(v); } - static inline svuint32_t svreinterpret(svint32_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint32_t svreinterpret(svint32_t v) KLEIDICV_STREAMING { return svreinterpret_u32(v); } }; // end of class VecTraits @@ -440,11 +478,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svint64_t svdup(int64_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svint64_t svdup(int64_t v) KLEIDICV_STREAMING { return svdup_s64(v); } - static inline svint64_t svreinterpret(svuint64_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svint64_t svreinterpret(svuint64_t v) KLEIDICV_STREAMING { return svreinterpret_s64(v); } }; // end of class VecTraits @@ -452,11 +489,10 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svuint64_t svdup(uint64_t v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint64_t svdup(uint64_t v) KLEIDICV_STREAMING { return svdup_u64(v); } - static inline svuint64_t svreinterpret(svint64_t v) - KLEIDICV_STREAMING_COMPATIBLE { + static inline svuint64_t svreinterpret(svint64_t v) KLEIDICV_STREAMING { return svreinterpret_u64(v); } }; // end of class VecTraits @@ -464,11 +500,11 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svfloat32_t svdup(float v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svfloat32_t svdup(float v) KLEIDICV_STREAMING { return svdup_f32(v); } static inline svfloat32_t svsub(svbool_t pg, svfloat32_t v, - svfloat32_t u) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t u) KLEIDICV_STREAMING { return svsub_f32_x(pg, v, u); } }; // end of class VecTraits @@ -476,7 +512,7 @@ class VecTraits : public VecTraitsBase { template <> class VecTraits : public VecTraitsBase { public: - static inline svfloat64_t svdup(double v) KLEIDICV_STREAMING_COMPATIBLE { + static inline svfloat64_t svdup(double v) KLEIDICV_STREAMING { return svdup_f64(v); } }; // end of class VecTraits @@ -492,12 +528,12 @@ class OperationContextAdapter : public OperationBase { using ContextType = Context; using VecTraits = typename OperationBase::VecTraits; - explicit OperationContextAdapter(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit OperationContextAdapter(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} // Forwards vector_path_2x() calls to the inner operation. template - void vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + void vector_path_2x(ArgTypes &&...args) KLEIDICV_STREAMING { svbool_t ctx_pg; ContextType ctx{ctx_pg}; ctx.set_predicate(VecTraits::svptrue()); @@ -506,7 +542,7 @@ class OperationContextAdapter : public OperationBase { // Forwards vector_path() calls to the inner operation. template - void vector_path(ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + void vector_path(ArgTypes &&...args) KLEIDICV_STREAMING { svbool_t ctx_pg; ContextType ctx{ctx_pg}; ctx.set_predicate(VecTraits::svptrue()); @@ -517,7 +553,7 @@ class OperationContextAdapter : public OperationBase { // operation is unrolled once. template std::enable_if_t remaining_path( - size_t length, ColumnTypes &&...columns) KLEIDICV_STREAMING_COMPATIBLE { + size_t length, ColumnTypes &&...columns) KLEIDICV_STREAMING { svbool_t ctx_pg; ContextType ctx{ctx_pg}; ctx.set_predicate(VecTraits::svwhilelt(size_t{0}, length)); @@ -528,7 +564,7 @@ class OperationContextAdapter : public OperationBase { // operation is not unrolled once. template std::enable_if_t remaining_path( - size_t length, ColumnTypes... columns) KLEIDICV_STREAMING_COMPATIBLE { + size_t length, ColumnTypes... columns) KLEIDICV_STREAMING { svbool_t ctx_pg; ContextType ctx{ctx_pg}; @@ -549,13 +585,13 @@ class RemainingPathAdapter : public OperationBase { public: using ContextType = Context; - explicit RemainingPathAdapter(OperationType &operation) - KLEIDICV_STREAMING_COMPATIBLE : OperationBase(operation) {} + explicit RemainingPathAdapter(OperationType &operation) KLEIDICV_STREAMING + : OperationBase(operation) {} // Forwards remaining_path() to either vector_path() or tail_path() of the // inner operation depending on what is requested by the innermost operation. template - void remaining_path(ArgTypes... args) KLEIDICV_STREAMING_COMPATIBLE { + void remaining_path(ArgTypes... args) KLEIDICV_STREAMING { if constexpr (OperationType::uses_tail_path()) { this->operation().tail_path(std::forward(args)...); } else { @@ -567,7 +603,7 @@ class RemainingPathAdapter : public OperationBase { // Shorthand for applying a generic unrolled SVE2 operation. template void apply_operation_by_rows(OperationType &operation, - ArgTypes &&...args) KLEIDICV_STREAMING_COMPATIBLE { + ArgTypes &&...args) KLEIDICV_STREAMING { ForwardingOperation forwarding_operation{operation}; OperationAdapter operation_adapter{forwarding_operation}; RemainingPathAdapter remaining_path_adapter{operation_adapter}; @@ -579,7 +615,7 @@ void apply_operation_by_rows(OperationType &operation, // Swap two variables, since some C++ Standard Library implementations do not // allow using std::swap for SVE vectors. template -static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING_COMPATIBLE { +static inline void swap_scalable(T &a, T &b) KLEIDICV_STREAMING { T tmp = a; a = b; b = tmp; @@ -591,7 +627,7 @@ template class ScalableVectorArray2D { public: std::reference_wrapper window[Rows][Cols]; - VectorType &operator()(int row, int col) KLEIDICV_STREAMING_COMPATIBLE { + VectorType &operator()(int row, int col) KLEIDICV_STREAMING { return window[row][col].get(); } }; diff --git a/kleidicv/include/kleidicv/traits.h b/kleidicv/include/kleidicv/traits.h index b0202f35955f3ba8ac4392bd9bf90dea7d7c7c66..112e4ec180a05a9810db0eb878f148f8e80c2250 100644 --- a/kleidicv/include/kleidicv/traits.h +++ b/kleidicv/include/kleidicv/traits.h @@ -19,8 +19,7 @@ class remove_streaming_compatible; #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2 template -class remove_streaming_compatible { +class remove_streaming_compatible { public: using type = Ret (Impl::*)(Args...); }; diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 51bb2e51e1d6dac9a9c305542a4eab2f3fbd76da..9b5c7ee7641fee9663e24fbe7688578365b956c9 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -13,10 +13,10 @@ namespace kleidicv { template -inline bool remap_s16_is_implemented( - size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, - kleidicv_border_type_t border_type, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE { +inline bool remap_s16_is_implemented(size_t src_stride, size_t src_width, + size_t src_height, size_t dst_width, + kleidicv_border_type_t border_type, + size_t channels) KLEIDICV_STREAMING { if constexpr (std::is_same::value || std::is_same::value) { return (src_stride / sizeof(T) <= std::numeric_limits::max() && @@ -32,10 +32,10 @@ inline bool remap_s16_is_implemented( } template -inline bool remap_s16point5_is_implemented( - size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, - kleidicv_border_type_t border_type, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE { +inline bool remap_s16point5_is_implemented(size_t src_stride, size_t src_width, + size_t src_height, size_t dst_width, + kleidicv_border_type_t border_type, + size_t channels) KLEIDICV_STREAMING { if constexpr (std::is_same::value || std::is_same::value) { return (src_stride / sizeof(T) <= @@ -55,7 +55,7 @@ template inline bool remap_f32_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, size_t dst_height, kleidicv_border_type_t border_type, size_t channels, - kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING { if constexpr (std::is_same::value || std::is_same::value) { return (src_stride <= std::numeric_limits::max() && diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h index cf7962afe6b289ffd3cdf927516fd5cfa31b5a8e..a88538cf2873b76beb4d5854cbd2d2c104ad908f 100644 --- a/kleidicv/include/kleidicv/transform/warp_perspective.h +++ b/kleidicv/include/kleidicv/transform/warp_perspective.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -30,7 +30,7 @@ template inline bool warp_perspective_is_implemented( size_t dst_width, size_t channels, kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_border_type_t border_type) KLEIDICV_STREAMING { if constexpr (std::is_same::value) { return (dst_width >= 8 && (interpolation == KLEIDICV_INTERPOLATION_NEAREST || diff --git a/kleidicv/include/kleidicv/types.h b/kleidicv/include/kleidicv/types.h index dc27824668266841ea3e134dff80370b7a3a505f..5bc65264e3ce7d646ad500c12cd8307843438b57 100644 --- a/kleidicv/include/kleidicv/types.h +++ b/kleidicv/include/kleidicv/types.h @@ -22,11 +22,10 @@ namespace KLEIDICV_TARGET_NAMESPACE { // Represents a point on a 2D plane. class Point final { public: - explicit Point(size_t x, size_t y) KLEIDICV_STREAMING_COMPATIBLE : x_{x}, - y_{y} {} + explicit Point(size_t x, size_t y) KLEIDICV_STREAMING : x_{x}, y_{y} {} - size_t x() const KLEIDICV_STREAMING_COMPATIBLE { return x_; } - size_t y() const KLEIDICV_STREAMING_COMPATIBLE { return y_; } + size_t x() const KLEIDICV_STREAMING { return x_; } + size_t y() const KLEIDICV_STREAMING { return y_; } private: size_t x_; @@ -36,32 +35,30 @@ class Point final { // Represents an area given by its width and height. class Rectangle final { public: - explicit Rectangle(size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE + explicit Rectangle(size_t width, size_t height) KLEIDICV_STREAMING : width_(width), height_(height) {} - explicit Rectangle(int width, int height) KLEIDICV_STREAMING_COMPATIBLE + explicit Rectangle(int width, int height) KLEIDICV_STREAMING : Rectangle(static_cast(width), static_cast(height)) {} - explicit Rectangle(kleidicv_rectangle_t rect) KLEIDICV_STREAMING_COMPATIBLE + explicit Rectangle(kleidicv_rectangle_t rect) KLEIDICV_STREAMING : Rectangle(rect.width, rect.height) {} - size_t width() const KLEIDICV_STREAMING_COMPATIBLE { return width_; } - size_t height() const KLEIDICV_STREAMING_COMPATIBLE { return height_; } - size_t area() const KLEIDICV_STREAMING_COMPATIBLE { - return width() * height(); - } + size_t width() const KLEIDICV_STREAMING { return width_; } + size_t height() const KLEIDICV_STREAMING { return height_; } + size_t area() const KLEIDICV_STREAMING { return width() * height(); } - void flatten() KLEIDICV_STREAMING_COMPATIBLE { + void flatten() KLEIDICV_STREAMING { width_ = area(); height_ = 1; } - bool operator==(const Rectangle &rhs) const KLEIDICV_STREAMING_COMPATIBLE { + bool operator==(const Rectangle &rhs) const KLEIDICV_STREAMING { return width() == rhs.width() && height() == rhs.height(); } - bool operator!=(const Rectangle &rhs) const KLEIDICV_STREAMING_COMPATIBLE { + bool operator!=(const Rectangle &rhs) const KLEIDICV_STREAMING { return !operator==(rhs); } @@ -74,31 +71,31 @@ class Rectangle final { class Margin final { public: explicit constexpr Margin(size_t left, size_t top, size_t right, - size_t bottom) KLEIDICV_STREAMING_COMPATIBLE + size_t bottom) KLEIDICV_STREAMING : left_(left), top_(top), right_(right), bottom_(bottom) {} - explicit constexpr Margin(size_t margin) KLEIDICV_STREAMING_COMPATIBLE + explicit constexpr Margin(size_t margin) KLEIDICV_STREAMING : left_(margin), top_(margin), right_(margin), bottom_(margin) {} explicit Margin(kleidicv_rectangle_t kernel, - kleidicv_point_t anchor) KLEIDICV_STREAMING_COMPATIBLE + kleidicv_point_t anchor) KLEIDICV_STREAMING : Margin(anchor.x, anchor.y, kernel.width - anchor.x - 1, kernel.height - anchor.y - 1) {} - explicit Margin(Rectangle kernel, Point anchor) KLEIDICV_STREAMING_COMPATIBLE + explicit Margin(Rectangle kernel, Point anchor) KLEIDICV_STREAMING : Margin(anchor.x(), anchor.y(), kernel.width() - anchor.x() - 1, kernel.height() - anchor.y() - 1) {} - size_t left() const KLEIDICV_STREAMING_COMPATIBLE { return left_; } - size_t top() const KLEIDICV_STREAMING_COMPATIBLE { return top_; } - size_t right() const KLEIDICV_STREAMING_COMPATIBLE { return right_; } - size_t bottom() const KLEIDICV_STREAMING_COMPATIBLE { return bottom_; } + size_t left() const KLEIDICV_STREAMING { return left_; } + size_t top() const KLEIDICV_STREAMING { return top_; } + size_t right() const KLEIDICV_STREAMING { return right_; } + size_t bottom() const KLEIDICV_STREAMING { return bottom_; } private: size_t left_; @@ -111,52 +108,49 @@ class Margin final { template class Columns final { public: - explicit Columns(T *ptr, size_t channels) KLEIDICV_STREAMING_COMPATIBLE + explicit Columns(T *ptr, size_t channels) KLEIDICV_STREAMING : ptr_{ptr}, channels_{channels} {} // Subscript operator to return an arbitrary column at an index. To account // for channel count use at() method. - T &operator[](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { - return ptr_[index]; - } + T &operator[](ptrdiff_t index) KLEIDICV_STREAMING { return ptr_[index]; } // Addition assignment operator to step across the columns. - Columns &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + Columns &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING { ptr_ += static_cast(channels()) * diff; return *this; } // Subtraction assignment operator to step across the columns. - Columns &operator-=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + Columns &operator-=(ptrdiff_t diff) KLEIDICV_STREAMING { ptr_ -= static_cast(channels()) * diff; return *this; } // Prefix increment operator to advance to the next column. - Columns &operator++() KLEIDICV_STREAMING_COMPATIBLE { return operator+=(1); } + Columns &operator++() KLEIDICV_STREAMING { return operator+=(1); } // NOLINTBEGIN(hicpp-explicit-conversions) // Implicit conversion operator from Columns to Columns. - [[nodiscard]] operator Columns() const - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] operator Columns() const KLEIDICV_STREAMING { return Columns{ptr_, channels()}; } // NOLINTEND(hicpp-explicit-conversions) // Returns a new instance at a given column. - [[nodiscard]] Columns at(ptrdiff_t column) KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] Columns at(ptrdiff_t column) KLEIDICV_STREAMING { return Columns{&ptr_[column * static_cast(channels())], channels()}; } // Returns a pointer to a given column. - [[nodiscard]] T *ptr_at(ptrdiff_t column) KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] T *ptr_at(ptrdiff_t column) KLEIDICV_STREAMING { return ptr_ + column * static_cast(channels()); } // Returns the number of channels in a row. - size_t channels() const KLEIDICV_STREAMING_COMPATIBLE { return channels_; } + size_t channels() const KLEIDICV_STREAMING { return channels_; } private: // Pointer to the current position. @@ -170,33 +164,31 @@ template class ParallelColumns final { public: explicit ParallelColumns(Columns columns_0, - Columns columns_1) KLEIDICV_STREAMING_COMPATIBLE + Columns columns_1) KLEIDICV_STREAMING : columns_{columns_0, columns_1} {} // Addition assignment operator to step across the columns. - ParallelColumns &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + ParallelColumns &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING { columns_[0] += diff; columns_[1] += diff; return *this; } // Subtraction assignment operator to navigate among rows. - ParallelColumns &operator-=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + ParallelColumns &operator-=(ptrdiff_t diff) KLEIDICV_STREAMING { return operator+=(-1 * diff); } // Prefix increment operator to advance to the next column. - ParallelColumns &operator++() KLEIDICV_STREAMING_COMPATIBLE { - return operator+=(1); - } + ParallelColumns &operator++() KLEIDICV_STREAMING { return operator+=(1); } // Returns the columns belonging to the first row. - [[nodiscard]] Columns first() const KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] Columns first() const KLEIDICV_STREAMING { return columns_[0]; } // Returns the columns belonging to the second row. - [[nodiscard]] Columns second() const KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] Columns second() const KLEIDICV_STREAMING { return columns_[1]; } @@ -210,40 +202,38 @@ template class RowBase { public: // Returns the distance in bytes between two consecutive rows. - size_t stride() const KLEIDICV_STREAMING_COMPATIBLE { return stride_; } + size_t stride() const KLEIDICV_STREAMING { return stride_; } // Returns the number of channels in a row. - size_t channels() const KLEIDICV_STREAMING_COMPATIBLE { return channels_; } + size_t channels() const KLEIDICV_STREAMING { return channels_; } // Returns true if rows are continuous for a given length, otherwise false. - bool is_continuous(size_t length) const KLEIDICV_STREAMING_COMPATIBLE { + bool is_continuous(size_t length) const KLEIDICV_STREAMING { return stride() == (length * channels() * sizeof(T)); } // When handling multiple rows this switches to a single row in an // implementation defined way. - void make_single_row() const KLEIDICV_STREAMING_COMPATIBLE {} + void make_single_row() const KLEIDICV_STREAMING {} // Returns false if is_continuous() always returns false, otherwise true. - static constexpr bool maybe_continuous() KLEIDICV_STREAMING_COMPATIBLE { - return true; - } + static constexpr bool maybe_continuous() KLEIDICV_STREAMING { return true; } protected: // TODO: default initialise members. // NOLINTBEGIN(hicpp-member-init) // The default constructor creates an uninitialized instance. - RowBase() KLEIDICV_STREAMING_COMPATIBLE = default; + RowBase() KLEIDICV_STREAMING = default; // NOLINTEND(hicpp-member-init) - RowBase(size_t stride, size_t channels) KLEIDICV_STREAMING_COMPATIBLE + RowBase(size_t stride, size_t channels) KLEIDICV_STREAMING : stride_(stride), channels_(channels) {} // Adds a stride to a pointer, and returns the new pointer. template - [[nodiscard]] static P *add_stride(P *ptr, ptrdiff_t stride) - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] static P *add_stride(P *ptr, + ptrdiff_t stride) KLEIDICV_STREAMING { uintptr_t intptr = reinterpret_cast(ptr); intptr += stride; // NOLINTBEGIN(performance-no-int-to-ptr) @@ -253,8 +243,8 @@ class RowBase { // Subtracts a stride to a pointer, and returns the new pointer. template - [[nodiscard]] static P *subtract_stride(P *ptr, ptrdiff_t stride) - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] static P *subtract_stride(P *ptr, + ptrdiff_t stride) KLEIDICV_STREAMING { uintptr_t intptr = reinterpret_cast(ptr); intptr -= stride; // NOLINTBEGIN(performance-no-int-to-ptr) @@ -279,55 +269,52 @@ class Rows final : public RowBase { using RowBase::stride; // The default constructor creates an uninitialized instance. - Rows() KLEIDICV_STREAMING_COMPATIBLE : RowBase() {} + Rows() KLEIDICV_STREAMING : RowBase() {} - explicit Rows(T *ptr, size_t stride, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE + explicit Rows(T *ptr, size_t stride, size_t channels) KLEIDICV_STREAMING : RowBase(stride, channels), ptr_{ptr} {} - explicit Rows(T *ptr, size_t stride) KLEIDICV_STREAMING_COMPATIBLE + explicit Rows(T *ptr, size_t stride) KLEIDICV_STREAMING : Rows(ptr, stride, 1) {} - explicit Rows(T *ptr) KLEIDICV_STREAMING_COMPATIBLE : Rows(ptr, 0, 0) {} + explicit Rows(T *ptr) KLEIDICV_STREAMING : Rows(ptr, 0, 0) {} // Subscript operator to return an arbitrary position within the current row. // To account for stride and channel count use at() method. - T &operator[](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { - return ptr_[index]; - } + T &operator[](ptrdiff_t index) KLEIDICV_STREAMING { return ptr_[index]; } // Addition assignment operator to navigate among rows. - Rows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + Rows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING { ptr_ = get_pointer_at(diff); return *this; } // Prefix increment operator to advance to the next row. - Rows &operator++() KLEIDICV_STREAMING_COMPATIBLE { return operator+=(1); } + Rows &operator++() KLEIDICV_STREAMING { return operator+=(1); } // NOLINTBEGIN(hicpp-explicit-conversions) // Returns a const variant of this instance. - [[nodiscard]] operator Rows() KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] operator Rows() KLEIDICV_STREAMING { return Rows{ptr_, stride(), channels()}; } // NOLINTEND(hicpp-explicit-conversions) // Returns a new instance at a given row and column. [[nodiscard]] Rows at(ptrdiff_t row, - ptrdiff_t column = 0) KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t column = 0) KLEIDICV_STREAMING { return Rows{get_pointer_at(row, column), stride(), channels()}; } // Returns a view on columns within the current row. - [[nodiscard]] Columns as_columns() const KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] Columns as_columns() const KLEIDICV_STREAMING { return Columns{ptr_, channels()}; } // Translates a logical one-dimensional element index into physical byte // offset for that element with a given row width. - [[nodiscard]] size_t offset_for_index(size_t index, size_t width) const - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] size_t offset_for_index(size_t index, + size_t width) const KLEIDICV_STREAMING { size_t row = index / width; size_t column = index % width; return row * stride() + column * sizeof(T); @@ -336,8 +323,8 @@ class Rows final : public RowBase { private: // Returns a column in a row at a given index taking stride and channels into // account. - [[nodiscard]] T *get_pointer_at(ptrdiff_t row, ptrdiff_t column = 0) - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] T *get_pointer_at(ptrdiff_t row, + ptrdiff_t column = 0) KLEIDICV_STREAMING { T *ptr = RowBase::add_stride(ptr_, row * static_cast(stride())); return &ptr[column * static_cast(channels())]; @@ -356,15 +343,15 @@ class IndirectRows : public RowBase { using RowBase::stride; // The default constructor creates an uninitialized instance. - IndirectRows() KLEIDICV_STREAMING_COMPATIBLE : RowBase() {} + IndirectRows() KLEIDICV_STREAMING : RowBase() {} explicit IndirectRows(T **ptr_storage, size_t stride, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE + size_t channels) KLEIDICV_STREAMING : RowBase(stride, channels), ptr_storage_(ptr_storage) {} explicit IndirectRows(T **ptr_storage, size_t depth, - Rows rows) KLEIDICV_STREAMING_COMPATIBLE + Rows rows) KLEIDICV_STREAMING : RowBase(rows.stride(), rows.channels()), ptr_storage_(ptr_storage) { for (size_t index = 0; index < depth; ++index) { @@ -374,30 +361,30 @@ class IndirectRows : public RowBase { // Subscript operator to return a position within the current row. To account // for stride and channel count use at() method. - T &operator[](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + T &operator[](ptrdiff_t index) KLEIDICV_STREAMING { return ptr_storage_[0][index]; } // Addition assignment operator to navigate among rows. - IndirectRows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + IndirectRows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING { ptr_storage_ += diff; return *this; } // Prefix increment operator to advance to the next row. - IndirectRows &operator++() KLEIDICV_STREAMING_COMPATIBLE { + IndirectRows &operator++() KLEIDICV_STREAMING { return this->operator+=(1); } // Returns a new instance at a given row and column. [[nodiscard]] Rows at(ptrdiff_t row, - ptrdiff_t column = 0) KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t column = 0) KLEIDICV_STREAMING { auto rows = Rows{ptr_storage_[row], stride(), channels()}; return rows.at(0, column); } // Returns a view on columns within the current row. - [[nodiscard]] Columns as_columns() const KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] Columns as_columns() const KLEIDICV_STREAMING { return Columns{ptr_storage_[0], channels()}; } @@ -415,8 +402,8 @@ class DoubleBufferedIndirectRows final : public IndirectRows { using IndirectRows::channels; using IndirectRows::stride; - explicit DoubleBufferedIndirectRows( - T **ptr_storage, size_t depth, Rows rows) KLEIDICV_STREAMING_COMPATIBLE + explicit DoubleBufferedIndirectRows(T **ptr_storage, size_t depth, + Rows rows) KLEIDICV_STREAMING : IndirectRows(ptr_storage, 2 * depth, rows) { // Fill the second half of the pointer storage. for (size_t index = 0; index < 2 * depth; ++index) { @@ -428,17 +415,17 @@ class DoubleBufferedIndirectRows final : public IndirectRows { } // Swaps the double buffered indirect rows. - void swap() KLEIDICV_STREAMING_COMPATIBLE { + void swap() KLEIDICV_STREAMING { std::swap(db_ptr_storage_[0], db_ptr_storage_[1]); } // Returns indirect rows where write is allowed. - [[nodiscard]] IndirectRows write_at() KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] IndirectRows write_at() KLEIDICV_STREAMING { return IndirectRows{db_ptr_storage_[0], stride(), channels()}; } // Returns indirect rows where read is allowed. - [[nodiscard]] IndirectRows read_at() KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] IndirectRows read_at() KLEIDICV_STREAMING { return IndirectRows{db_ptr_storage_[1], stride(), channels()}; } @@ -456,35 +443,32 @@ class ParallelRows final : public RowBase { using RowBase::stride; explicit ParallelRows(T *ptr, size_t stride, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE + size_t channels) KLEIDICV_STREAMING : RowBase(2 * stride, channels), ptrs_{ptr, RowBase::add_stride(ptr, stride)} {} - explicit ParallelRows(T *ptr, size_t stride) KLEIDICV_STREAMING_COMPATIBLE + explicit ParallelRows(T *ptr, size_t stride) KLEIDICV_STREAMING : ParallelRows(ptr, stride, 1) {} // Addition assignment operator to navigate among rows. - ParallelRows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING_COMPATIBLE { + ParallelRows &operator+=(ptrdiff_t diff) KLEIDICV_STREAMING { ptrs_[0] = RowBase::add_stride(ptrs_[0], diff * stride()); ptrs_[1] = RowBase::add_stride(ptrs_[1], diff * stride()); return *this; } // Prefix increment operator to advance to the next row. - ParallelRows &operator++() KLEIDICV_STREAMING_COMPATIBLE { - return operator+=(1); - } + ParallelRows &operator++() KLEIDICV_STREAMING { return operator+=(1); } // Returns views on columns within the current rows. - [[nodiscard]] ParallelColumns as_columns() const - KLEIDICV_STREAMING_COMPATIBLE { + [[nodiscard]] ParallelColumns as_columns() const KLEIDICV_STREAMING { Columns columns_0{ptrs_[0], channels()}; Columns columns_1{ptrs_[1], channels()}; return ParallelColumns{columns_0, columns_1}; } // Instructs the logic to drop the second row. - void make_single_row() KLEIDICV_STREAMING_COMPATIBLE { ptrs_[1] = ptrs_[0]; } + void make_single_row() KLEIDICV_STREAMING { ptrs_[1] = ptrs_[0]; } private: // Pointers to the two parallel rows. @@ -493,7 +477,7 @@ class ParallelRows final : public RowBase { template void zip_rows(OperationType &operation, Rectangle rect, - RowTypes... rows) KLEIDICV_STREAMING_COMPATIBLE { + RowTypes... rows) KLEIDICV_STREAMING { // Unary left fold. Evaluates the expression for every part of the unexpanded // parameter pack 'rows'. if ((... && (rows.is_continuous(rect.width())))) { @@ -509,7 +493,7 @@ void zip_rows(OperationType &operation, Rectangle rect, template void zip_parallel_rows(OperationType &operation, Rectangle rect, - RowTypes... rows) KLEIDICV_STREAMING_COMPATIBLE { + RowTypes... rows) KLEIDICV_STREAMING { for (size_t row_index = 0; row_index < rect.height(); row_index += 2) { // Handle the last odd row in a special way. if (KLEIDICV_UNLIKELY(row_index == (rect.height() - 1))) { @@ -527,7 +511,7 @@ template class CopyRows final { public: void process_row(size_t length, Columns src, - Columns dst) KLEIDICV_STREAMING_COMPATIBLE { + Columns dst) KLEIDICV_STREAMING { #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2 __arm_sc_memmove(static_cast(&dst[0]), static_cast(&src[0]), @@ -540,8 +524,7 @@ class CopyRows final { } template - static void copy_rows(Rectangle rect, S src, - D dst) KLEIDICV_STREAMING_COMPATIBLE { + static void copy_rows(Rectangle rect, S src, D dst) KLEIDICV_STREAMING { CopyRows operation; zip_rows(operation, rect, src, dst); } @@ -552,7 +535,7 @@ template class CopyNonOverlappingRows final { public: void process_row(size_t length, Columns src, - Columns dst) KLEIDICV_STREAMING_COMPATIBLE { + Columns dst) KLEIDICV_STREAMING { #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2 __arm_sc_memcpy(static_cast(&dst[0]), static_cast(&src[0]), @@ -565,7 +548,7 @@ class CopyNonOverlappingRows final { } static void copy_rows(Rectangle rect, Rows src, - Rows dst) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst) KLEIDICV_STREAMING { CopyNonOverlappingRows operation; zip_rows(operation, rect, src, dst); } diff --git a/kleidicv/include/kleidicv/utils.h b/kleidicv/include/kleidicv/utils.h index e95ce94ea658c17696cd4945e3ba3e96f9d4801e..ab4f4d9d35b5deeb5eaf4e25d98e1adfab5f3947 100644 --- a/kleidicv/include/kleidicv/utils.h +++ b/kleidicv/include/kleidicv/utils.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -20,7 +20,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { template ::is_signed, bool> = true, std::enable_if_t::is_signed, bool> = true> -static U saturating_cast(S value) KLEIDICV_STREAMING_COMPATIBLE { +static U saturating_cast(S value) KLEIDICV_STREAMING { if (value > std::numeric_limits::max()) { return std::numeric_limits::max(); } @@ -36,7 +36,7 @@ template < typename SrcType, typename DstType, std::enable_if_t && std::is_unsigned_v, bool> = true> -static DstType saturating_cast(SrcType value) KLEIDICV_STREAMING_COMPATIBLE { +static DstType saturating_cast(SrcType value) KLEIDICV_STREAMING { return static_cast(value); } @@ -45,7 +45,7 @@ template < typename SrcType, typename DstType, std::enable_if_t && std::is_unsigned_v, bool> = true> -static DstType saturating_cast(SrcType value) KLEIDICV_STREAMING_COMPATIBLE { +static DstType saturating_cast(SrcType value) KLEIDICV_STREAMING { DstType max_value = std::numeric_limits::max(); if (value > static_cast(max_value)) { @@ -57,20 +57,19 @@ static DstType saturating_cast(SrcType value) KLEIDICV_STREAMING_COMPATIBLE { // Rounding shift right. template -static T rounding_shift_right(T value, - size_t shift) KLEIDICV_STREAMING_COMPATIBLE { +static T rounding_shift_right(T value, size_t shift) KLEIDICV_STREAMING { return (value + (1UL << (shift - 1))) >> shift; } // When placed in a loop, it effectively disables loop vectorization. -static inline void disable_loop_vectorization() KLEIDICV_STREAMING_COMPATIBLE { +static inline void disable_loop_vectorization() KLEIDICV_STREAMING { __asm__(""); } // Helper class to unroll a loop as needed. class LoopUnroll final { public: - explicit LoopUnroll(size_t length, size_t step) KLEIDICV_STREAMING_COMPATIBLE + explicit LoopUnroll(size_t length, size_t step) KLEIDICV_STREAMING : length_(length), step_(step), index_(0), @@ -78,22 +77,20 @@ class LoopUnroll final { // Loop unrolled four times. template - LoopUnroll &unroll_four_times(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &unroll_four_times(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<4>(callback); } // Loop unrolled twice. template - LoopUnroll &unroll_twice(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &unroll_twice(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<2>(callback); } // Unrolls the loop twice, if enabled. template LoopUnroll &unroll_twice_if([[maybe_unused]] CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if constexpr (Enable) { return unroll_twice(callback); } @@ -103,14 +100,14 @@ class LoopUnroll final { // Loop unrolled once. template - LoopUnroll &unroll_once(CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &unroll_once(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<1>(callback); } // Unrolls the loop once, if enabled. template LoopUnroll &unroll_once_if([[maybe_unused]] CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if constexpr (Enable) { return unroll_once(callback); } @@ -120,7 +117,7 @@ class LoopUnroll final { // Processes trailing data. template - LoopUnroll &tail(CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &tail(CallbackType callback) KLEIDICV_STREAMING { for (index_ = 0; index_ < remaining_length(); ++index_) { disable_loop_vectorization(); callback(index_); @@ -132,7 +129,7 @@ class LoopUnroll final { // Processes all remaining data at once. template - LoopUnroll &remaining(CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &remaining(CallbackType callback) KLEIDICV_STREAMING { if (length_) { callback(length_, step_); length_ = 0; @@ -142,27 +139,22 @@ class LoopUnroll final { } // Returns true if there is nothing left to process. - bool empty() const KLEIDICV_STREAMING_COMPATIBLE { return length_ == 0; } + bool empty() const KLEIDICV_STREAMING { return length_ == 0; } // Returns the step value. - size_t step() const KLEIDICV_STREAMING_COMPATIBLE { return step_; } + size_t step() const KLEIDICV_STREAMING { return step_; } // Returns the remaining length. - size_t remaining_length() const KLEIDICV_STREAMING_COMPATIBLE { - return length_; - } + size_t remaining_length() const KLEIDICV_STREAMING { return length_; } // Returns true if it is possible to avoid the tail loop. - bool can_avoid_tail() const KLEIDICV_STREAMING_COMPATIBLE { - return can_avoid_tail_; - } + bool can_avoid_tail() const KLEIDICV_STREAMING { return can_avoid_tail_; } // Instructs the loop logic to prepare to avoid the tail loop. - void avoid_tail() KLEIDICV_STREAMING_COMPATIBLE { length_ = step(); } + void avoid_tail() KLEIDICV_STREAMING { length_ = step(); } template - LoopUnroll &unroll_n_times(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll &unroll_n_times(CallbackType callback) KLEIDICV_STREAMING { const size_t step = UnrollFactor * step_; // In practice step will never be zero and we don't want to spend // instructions on checking that. @@ -181,8 +173,7 @@ class LoopUnroll final { // Instructs the loop logic to avoid the tail loop. template - bool try_avoid_tail_loop(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + bool try_avoid_tail_loop(CallbackType callback) KLEIDICV_STREAMING { if (KLEIDICV_UNLIKELY(!can_avoid_tail_)) { return false; } @@ -207,35 +198,32 @@ class LoopUnroll final { template class LoopUnroll2 final { public: - explicit LoopUnroll2(size_t length, size_t step) KLEIDICV_STREAMING_COMPATIBLE + explicit LoopUnroll2(size_t length, size_t step) KLEIDICV_STREAMING : length_(length), step_(step), index_(0) {} explicit LoopUnroll2(size_t start_index, size_t length, - size_t step) KLEIDICV_STREAMING_COMPATIBLE + size_t step) KLEIDICV_STREAMING : length_(length), step_(step), index_(std::min(start_index, length)) {} // Loop unrolled four times. template - LoopUnroll2 &unroll_four_times(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_four_times(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<4>(callback); } // Loop unrolled twice. template - LoopUnroll2 &unroll_twice(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_twice(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<2>(callback); } // Unrolls the loop twice, if enabled. template - LoopUnroll2 &unroll_twice_if(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_twice_if(CallbackType callback) KLEIDICV_STREAMING { if constexpr (Enable) { return unroll_twice(callback); } @@ -245,15 +233,13 @@ class LoopUnroll2 final { // Loop unrolled once. template - LoopUnroll2 &unroll_once(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_once(CallbackType callback) KLEIDICV_STREAMING { return unroll_n_times<1>(callback); } // Unrolls the loop once, if enabled. template - LoopUnroll2 &unroll_once_if(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_once_if(CallbackType callback) KLEIDICV_STREAMING { if constexpr (Enable) { return unroll_once(callback); } @@ -263,7 +249,7 @@ class LoopUnroll2 final { // Processes trailing data. template - LoopUnroll2 &tail(CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &tail(CallbackType callback) KLEIDICV_STREAMING { while (index_ < length_) { disable_loop_vectorization(); callback(index_++); @@ -274,7 +260,7 @@ class LoopUnroll2 final { // Processes all remaining data at once. template - LoopUnroll2 &remaining(CallbackType callback) KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &remaining(CallbackType callback) KLEIDICV_STREAMING { if (remaining_length()) { callback(index_, length_); index_ = length_; @@ -284,20 +270,19 @@ class LoopUnroll2 final { } // Returns true if there is nothing left to process. - bool empty() const KLEIDICV_STREAMING_COMPATIBLE { return length_ == index_; } + bool empty() const KLEIDICV_STREAMING { return length_ == index_; } // Returns the step value. - size_t step() const KLEIDICV_STREAMING_COMPATIBLE { return step_; } + size_t step() const KLEIDICV_STREAMING { return step_; } // Returns the remaining length. - size_t remaining_length() const KLEIDICV_STREAMING_COMPATIBLE { + size_t remaining_length() const KLEIDICV_STREAMING { return length_ - index_; } private: template - LoopUnroll2 &unroll_n_times(CallbackType callback) - KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 &unroll_n_times(CallbackType callback) KLEIDICV_STREAMING { const size_t n_step = UnrollFactor * step(); size_t max_index = index_ + (remaining_length() / n_step) * n_step; @@ -341,7 +326,7 @@ class LoopUnroll2 final { // Check whether any of the arguments are null pointers. template -bool any_null(Pointers... pointers) KLEIDICV_STREAMING_COMPATIBLE { +bool any_null(Pointers... pointers) KLEIDICV_STREAMING { return (... || (pointers == nullptr)); } @@ -353,7 +338,7 @@ bool any_null(Pointers... pointers) KLEIDICV_STREAMING_COMPATIBLE { } while (false) template -bool is_misaligned(Value v) KLEIDICV_STREAMING_COMPATIBLE { +bool is_misaligned(Value v) KLEIDICV_STREAMING { constexpr size_t kMask = alignof(AlignType) - 1; static_assert(kMask == 0b0001 || kMask == 0b0011 || kMask == 0b0111 || kMask == 0b1111); @@ -363,12 +348,12 @@ bool is_misaligned(Value v) KLEIDICV_STREAMING_COMPATIBLE { // Return value aligned up to the next multiple of alignment // Assumes alignment is a power of two. template -T align_up(T value, size_t alignment) KLEIDICV_STREAMING_COMPATIBLE { +T align_up(T value, size_t alignment) KLEIDICV_STREAMING { return (value + alignment - 1) & ~(alignment - 1); } template -T *align_up(T *value, size_t alignment) KLEIDICV_STREAMING_COMPATIBLE { +T *align_up(T *value, size_t alignment) KLEIDICV_STREAMING { // NOLINTBEGIN(performance-no-int-to-ptr) return reinterpret_cast( align_up(reinterpret_cast(value), alignment)); @@ -378,7 +363,7 @@ T *align_up(T *value, size_t alignment) KLEIDICV_STREAMING_COMPATIBLE { // Specialisation for when stride misalignment is possible. template std::enable_if_t check_pointer_and_stride( - T *pointer, size_t stride, size_t height) KLEIDICV_STREAMING_COMPATIBLE { + T *pointer, size_t stride, size_t height) KLEIDICV_STREAMING { if (pointer == nullptr) { return KLEIDICV_ERROR_NULL_POINTER; } @@ -391,8 +376,7 @@ std::enable_if_t check_pointer_and_stride( // Specialisation for when stride misalignment is impossible. template std::enable_if_t check_pointer_and_stride( - T *pointer, size_t /*stride*/, - size_t /*height*/) KLEIDICV_STREAMING_COMPATIBLE { + T *pointer, size_t /*stride*/, size_t /*height*/) KLEIDICV_STREAMING { if (pointer == nullptr) { return KLEIDICV_ERROR_NULL_POINTER; } diff --git a/kleidicv/include/kleidicv/workspace/blur_and_downsample_ws.h b/kleidicv/include/kleidicv/workspace/blur_and_downsample_ws.h index cbdb06713a6dc71430c16815118074ea4aac9d12..9b29cf4f0f8daa54d092cbb281bd94a0822fd11f 100644 --- a/kleidicv/include/kleidicv/workspace/blur_and_downsample_ws.h +++ b/kleidicv/include/kleidicv/workspace/blur_and_downsample_ws.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -18,7 +18,7 @@ class BlurAndDownsampleFilterWorkspace final : public SeparableFilterWorkspace { Rows src_rows, Rows dst_rows, size_t channels, typename FilterType::BorderType border_type, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter) KLEIDICV_STREAMING { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; @@ -52,7 +52,7 @@ class BlurAndDownsampleFilterWorkspace final : public SeparableFilterWorkspace { Rows dst_rows, FilterType filter, typename FilterType::BorderInfoType horizontal_border) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { // Margin associated with the filter. constexpr size_t margin = filter.margin; diff --git a/kleidicv/include/kleidicv/workspace/border_15x15.h b/kleidicv/include/kleidicv/workspace/border_15x15.h index c281383abae58570d2f2cd7ebde645f26c520767..b9edc1b016e337c1ce5826808e6804a54295195c 100644 --- a/kleidicv/include/kleidicv/workspace/border_15x15.h +++ b/kleidicv/include/kleidicv/workspace/border_15x15.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -56,14 +56,14 @@ class FixedBorderInfo final { : height_(height), border_type_(border_type) {} // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets_without_border() const KLEIDICV_STREAMING { return get(-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7); } // NOLINTBEGIN(readability-function-cognitive-complexity) // Returns offsets for columns affected by left border. Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == 0) { @@ -152,7 +152,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by right border. Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == (height_ - 7)) { @@ -242,7 +242,7 @@ class FixedBorderInfo final { // Returns offsets for rows or columns affected by any border. Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if (row_or_column_index <= 6U) { // Rows and columns have the same offsets. return offsets_with_left_border(row_or_column_index); @@ -260,7 +260,7 @@ class FixedBorderInfo final { ptrdiff_t o4, ptrdiff_t o5, ptrdiff_t o6, ptrdiff_t o7, ptrdiff_t o8, ptrdiff_t o9, ptrdiff_t o10, ptrdiff_t o11, ptrdiff_t o12, ptrdiff_t o13, - ptrdiff_t o14) const KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t o14) const KLEIDICV_STREAMING { return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14}; } diff --git a/kleidicv/include/kleidicv/workspace/border_21x21.h b/kleidicv/include/kleidicv/workspace/border_21x21.h index df8d4cd13b073d61271f7fe8b034b007eff4741d..6ee260ea29840331029fe546adec4ae5fb7f6db2 100644 --- a/kleidicv/include/kleidicv/workspace/border_21x21.h +++ b/kleidicv/include/kleidicv/workspace/border_21x21.h @@ -63,7 +63,7 @@ class FixedBorderInfo final { : width_(width), border_type_(border_type) {} // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets_without_border() const KLEIDICV_STREAMING { return get(-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); } @@ -71,7 +71,7 @@ class FixedBorderInfo final { // NOLINTBEGIN(readability-function-cognitive-complexity) // Returns offsets for columns affected by left border. Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == 0) { @@ -222,7 +222,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by right border. Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == (width_ - 10)) { @@ -374,7 +374,7 @@ class FixedBorderInfo final { // Returns offsets for rows or columns affected by any border. Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if (row_or_column_index < 10U) { // Rows and columns have the same offsets. return offsets_with_left_border(row_or_column_index); @@ -393,7 +393,7 @@ class FixedBorderInfo final { ptrdiff_t o8, ptrdiff_t o9, ptrdiff_t o10, ptrdiff_t o11, ptrdiff_t o12, ptrdiff_t o13, ptrdiff_t o14, ptrdiff_t o15, ptrdiff_t o16, ptrdiff_t o17, ptrdiff_t o18, ptrdiff_t o19, - ptrdiff_t o20) const KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t o20) const KLEIDICV_STREAMING { return Offsets{o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, o16, o17, o18, o19, o20}; } diff --git a/kleidicv/include/kleidicv/workspace/border_3x3.h b/kleidicv/include/kleidicv/workspace/border_3x3.h index a3fc69182cc8cab56ffa65123578f119a402c11c..688dc65b38fd4c717f39252d81e7aa442f84c90d 100644 --- a/kleidicv/include/kleidicv/workspace/border_3x3.h +++ b/kleidicv/include/kleidicv/workspace/border_3x3.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -41,7 +41,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by left border. Offsets offsets_with_left_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: case FixedBorderType::REFLECT: @@ -63,7 +63,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by right border. Offsets offsets_with_right_border(size_t /* column_index */) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: case FixedBorderType::REFLECT: @@ -85,7 +85,7 @@ class FixedBorderInfo final { // Returns offsets for rows or columns affected by any border. Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if (row_or_column_index == 0U) { // Rows and columns have the same offsets. return offsets_with_left_border(row_or_column_index); diff --git a/kleidicv/include/kleidicv/workspace/border_5x5.h b/kleidicv/include/kleidicv/workspace/border_5x5.h index 8d19636ec0de2ff25954e3531c4e6bb9b5dca51c..7609fea7888a9ac8a3bcee33d2ca8c666c93615f 100644 --- a/kleidicv/include/kleidicv/workspace/border_5x5.h +++ b/kleidicv/include/kleidicv/workspace/border_5x5.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -43,13 +43,13 @@ class FixedBorderInfo final { : height_(height), border_type_(border_type) {} // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets_without_border() const KLEIDICV_STREAMING { return get(-2, -1, 0, 1, 2); } // Returns offsets for columns affected by left border. Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == 0) { @@ -90,7 +90,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by right border. Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == (height_ - 2)) { @@ -131,7 +131,7 @@ class FixedBorderInfo final { // Returns offsets for rows or columns affected by any border. Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if (row_or_column_index <= 1U) { // Rows and columns have the same offsets. return offsets_with_left_border(row_or_column_index); @@ -146,7 +146,7 @@ class FixedBorderInfo final { private: // Takes care of static signed to unsigned casts. Offsets get(ptrdiff_t o0, ptrdiff_t o1, ptrdiff_t o2, ptrdiff_t o3, - ptrdiff_t o4) const KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t o4) const KLEIDICV_STREAMING { return Offsets{o0, o1, o2, o3, o4}; } diff --git a/kleidicv/include/kleidicv/workspace/border_7x7.h b/kleidicv/include/kleidicv/workspace/border_7x7.h index 83e6d391157144a55007955f782f0195b3c73c76..39195820f10becc3e4d3c35be6bff976e1800141 100644 --- a/kleidicv/include/kleidicv/workspace/border_7x7.h +++ b/kleidicv/include/kleidicv/workspace/border_7x7.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -45,13 +45,13 @@ class FixedBorderInfo final { : height_(height), border_type_(border_type) {} // Returns offsets without the influence of any border. - Offsets offsets_without_border() const KLEIDICV_STREAMING_COMPATIBLE { + Offsets offsets_without_border() const KLEIDICV_STREAMING { return get(-3, -2, -1, 0, 1, 2, 3); } // Returns offsets for columns affected by left border. Offsets offsets_with_left_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == 0) { @@ -100,7 +100,7 @@ class FixedBorderInfo final { // Returns offsets for columns affected by right border. Offsets offsets_with_right_border(size_t column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { switch (border_type_) { case FixedBorderType::REPLICATE: if (column_index == (height_ - 3)) { @@ -149,7 +149,7 @@ class FixedBorderInfo final { // Returns offsets for rows or columns affected by any border. Offsets offsets_with_border(size_t row_or_column_index) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { if (row_or_column_index <= 2U) { // Rows and columns have the same offsets. return offsets_with_left_border(row_or_column_index); @@ -165,7 +165,7 @@ class FixedBorderInfo final { // Takes care of static signed to unsigned casts. Offsets get(ptrdiff_t o0, ptrdiff_t o1, ptrdiff_t o2, ptrdiff_t o3, ptrdiff_t o4, ptrdiff_t o5, - ptrdiff_t o6) const KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t o6) const KLEIDICV_STREAMING { return Offsets{o0, o1, o2, o3, o4, o5, o6}; } diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index f0564cd833d33f676aadc1c3432f8e2fcd366c9a..a9cb2f7aa55aa33eeb3d251214a443cc6a238200 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -20,8 +20,7 @@ class SeparableFilterWorkspace; // Deleter for SeparableFilterWorkspace instances. class SeparableFilterWorkspaceDeleter { public: - void operator()(SeparableFilterWorkspace *ptr) const - KLEIDICV_STREAMING_COMPATIBLE { + void operator()(SeparableFilterWorkspace *ptr) const KLEIDICV_STREAMING { std::free(ptr); }; }; @@ -82,8 +81,7 @@ class SeparableFilterWorkspace { // Creates a workspace on the heap. static Pointer create(Rectangle rect, size_t channels, - size_t intermediate_size) - KLEIDICV_STREAMING_COMPATIBLE { + size_t intermediate_size) KLEIDICV_STREAMING { size_t buffer_rows_number_of_elements = rect.width() * channels; // Adding more elements because of SVE, where interleaving stores are // governed by one predicate. For example, if a predicate requires 7 uint8_t @@ -129,7 +127,7 @@ class SeparableFilterWorkspace { Rows src_rows, Rows dst_rows, size_t channels, typename FilterType::BorderType border_type, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter) KLEIDICV_STREAMING { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; @@ -163,7 +161,7 @@ class SeparableFilterWorkspace { Rows dst_rows, size_t channels, typename FilterType::BorderType /* border_type */, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter) KLEIDICV_STREAMING { // Buffer rows which hold intermediate widened data. auto buffer_rows = Rows{reinterpret_cast( &data_[buffer_rows_offset_]), @@ -205,7 +203,7 @@ class SeparableFilterWorkspace { Rows dst_rows, FilterType filter, typename FilterType::BorderInfoType horizontal_border) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { // Margin associated with the filter. constexpr size_t margin = filter.margin; diff --git a/kleidicv/src/analysis/min_max_sc.h b/kleidicv/src/analysis/min_max_sc.h index a42ebdff8e74e05fcbf0a764186f1165c172dad6..1a44a956aa7bb39643405cd2a6eed38f2927dbee 100644 --- a/kleidicv/src/analysis/min_max_sc.h +++ b/kleidicv/src/analysis/min_max_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -19,23 +19,21 @@ class MinMax final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; using ContextType = Context; - MinMax(VectorType &vmin, VectorType &vmax) KLEIDICV_STREAMING_COMPATIBLE - : vmin_{vmin}, - vmax_{vmax} {} + MinMax(VectorType &vmin, VectorType &vmax) KLEIDICV_STREAMING : vmin_{vmin}, + vmax_{vmax} {} - void vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + void vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { auto pg = ctx.predicate(); vmin_ = svmin_m(pg, vmin_, src); vmax_ = svmax_m(pg, vmax_, src); } - ScalarType get_min() const KLEIDICV_STREAMING_COMPATIBLE { + ScalarType get_min() const KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); return svminv(pg, vmin_); } - ScalarType get_max() const KLEIDICV_STREAMING_COMPATIBLE { + ScalarType get_max() const KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); return svmaxv(pg, vmax_); } @@ -47,8 +45,7 @@ class MinMax final : public UnrollTwice { template kleidicv_error_t min_max_sc(const ScalarType *src, size_t src_stride, size_t width, size_t height, ScalarType *min_value, - ScalarType *max_value) - KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *max_value) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/arithmetics/absdiff_sme.cpp b/kleidicv/src/arithmetics/absdiff_sme.cpp index ae44735355205998a45b4ee4cf1f3af7445b8c2e..84caf267edce2241d69b7b5ea04d174cc0988c25 100644 --- a/kleidicv/src/arithmetics/absdiff_sme.cpp +++ b/kleidicv/src/arithmetics/absdiff_sme.cpp @@ -10,7 +10,7 @@ namespace kleidicv::sme { template ::value, bool> = true> VectorType vector_path_impl(svbool_t pg, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { // Results of SABD may be outside the signed range so use two // saturating instructions instead. return svqabs_x(pg, svqsub_m(pg, src_a, src_b)); @@ -19,7 +19,7 @@ VectorType vector_path_impl(svbool_t pg, VectorType src_a, template ::value, bool> = true> VectorType vector_path_impl(svbool_t pg, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { return svabd_m(pg, src_a, src_b); } @@ -31,7 +31,7 @@ class SaturatingAbsDiff final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { return vector_path_impl(ctx.predicate(), src_a, src_b); } }; // end of class SaturatingAbsDiff diff --git a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h index 3b8075b68f7d37fac45b24493c156b874c195a70..89e9acee2532d35dca66563874772b3529d73612 100644 --- a/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h +++ b/kleidicv/src/arithmetics/add_abs_with_threshold_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -20,10 +20,10 @@ class SaturatingAddAbsWithThreshold final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; explicit SaturatingAddAbsWithThreshold(ScalarType threshold) - KLEIDICV_STREAMING_COMPATIBLE : threshold_(threshold) {} + KLEIDICV_STREAMING : threshold_(threshold) {} VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { auto pg = ctx.predicate(); VectorType add_abs = svqadd_x(pg, svqabs_x(pg, src_a), svqabs_x(pg, src_b)); svbool_t predicate = svcmpgt(pg, add_abs, threshold_); @@ -38,7 +38,7 @@ template kleidicv_error_t saturating_add_abs_with_threshold_sc( const T *src_a, size_t src_a_stride, const T *src_b, size_t src_b_stride, T *dst, size_t dst_stride, size_t width, size_t height, - T threshold) KLEIDICV_STREAMING_COMPATIBLE { + T threshold) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); diff --git a/kleidicv/src/arithmetics/add_api.cpp b/kleidicv/src/arithmetics/add_api.cpp index a7311412c9b5ab6fb3031ca735388280930cb59d..05f27c0c25eb1fc27c451e15630eb877fcc2e077 100644 --- a/kleidicv/src/arithmetics/add_api.cpp +++ b/kleidicv/src/arithmetics/add_api.cpp @@ -34,13 +34,33 @@ kleidicv_error_t saturating_add(const T *src_a, size_t src_a_stride, } // namespace sme +#if KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 +namespace sme2 { +template +kleidicv_error_t saturating_add(const T *src_a, size_t src_a_stride, + const T *src_b, size_t src_b_stride, T *dst, + size_t dst_stride, size_t width, size_t height); + +} // namespace sme2 + +#endif // KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 + } // namespace kleidicv +#if KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 +#define KLEIDICV_DEFINE_C_API(name, type) \ + KLEIDICV_MULTIVERSION_C_API( \ + name, &kleidicv::neon::saturating_add, \ + KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::saturating_add), \ + KLEIDICV_SME_IMPL_IF(&kleidicv::sme::saturating_add), \ + KLEIDICV_SME2_IMPL_IF(&kleidicv::sme2::saturating_add)) +#else #define KLEIDICV_DEFINE_C_API(name, type) \ KLEIDICV_MULTIVERSION_C_API( \ name, &kleidicv::neon::saturating_add, \ KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::saturating_add), \ KLEIDICV_SME_IMPL_IF(&kleidicv::sme::saturating_add), nullptr) +#endif // KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 KLEIDICV_DEFINE_C_API(kleidicv_saturating_add_s8, int8_t); KLEIDICV_DEFINE_C_API(kleidicv_saturating_add_u8, uint8_t); diff --git a/kleidicv/src/arithmetics/add_sc.h b/kleidicv/src/arithmetics/add_sc.h new file mode 100644 index 0000000000000000000000000000000000000000..be2c4b33cb3e932704f4957856cb3132b33e4dec --- /dev/null +++ b/kleidicv/src/arithmetics/add_sc.h @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ADD_SC_H +#define ADD_SC_H + +#include + +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class SaturatingAdd final : public UnrollTwice { + public: + using ContextType = Context; + using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + VectorType vector_path(ContextType ctx, VectorType src_a, + VectorType src_b) KLEIDICV_STREAMING { + return svqadd_m(ctx.predicate(), src_a, src_b); + } +}; // end of class SaturatingAdd + +template +static kleidicv_error_t saturating_add_sc(const T *src_a, size_t src_a_stride, + const T *src_b, size_t src_b_stride, + T *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { + CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); + CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + SaturatingAdd operation; + Rectangle rect{width, height}; + Rows src_a_rows{src_a, src_a_stride}; + Rows src_b_rows{src_b, src_b_stride}; + Rows dst_rows{dst, dst_stride}; + apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, dst_rows); + return KLEIDICV_OK; +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // ADD_SC_H diff --git a/kleidicv/src/arithmetics/add_sme.cpp b/kleidicv/src/arithmetics/add_sme.cpp index 4022248c131da7c4f589973c498e1efb54dfc667..da6d7d6cbfa8a68dad5f612074ca63642943e916 100644 --- a/kleidicv/src/arithmetics/add_sme.cpp +++ b/kleidicv/src/arithmetics/add_sme.cpp @@ -2,42 +2,17 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - -#include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "add_sc.h" namespace kleidicv::sme { -template -class SaturatingAdd final : public UnrollTwice { - public: - using ContextType = Context; - using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - using VectorType = typename VecTraits::VectorType; - - VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { - return svqadd_m(ctx.predicate(), src_a, src_b); - } -}; // end of class SaturatingAdd - template -KLEIDICV_LOCALLY_STREAMING kleidicv_error_t saturating_add( - const T *src_a, size_t src_a_stride, const T *src_b, size_t src_b_stride, - T *dst, size_t dst_stride, size_t width, size_t height) { - CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); - CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); - CHECK_IMAGE_SIZE(width, height); - - SaturatingAdd operation; - Rectangle rect{width, height}; - Rows src_a_rows{src_a, src_a_stride}; - Rows src_b_rows{src_b, src_b_stride}; - Rows dst_rows{dst, dst_stride}; - apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, dst_rows); - return KLEIDICV_OK; +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +saturating_add(const T *src_a, size_t src_a_stride, const T *src_b, + size_t src_b_stride, T *dst, size_t dst_stride, size_t width, + size_t height) { + return saturating_add_sc(src_a, src_a_stride, src_b, src_b_stride, dst, + dst_stride, width, height); } #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ diff --git a/kleidicv/src/arithmetics/add_sme2.cpp b/kleidicv/src/arithmetics/add_sme2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72568dc44a17bf01279108508b90cf910da4d841 --- /dev/null +++ b/kleidicv/src/arithmetics/add_sme2.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "add_sc.h" + +#if KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 + +namespace kleidicv::sme2 { + +template +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +saturating_add(const T *src_a, size_t src_a_stride, const T *src_b, + size_t src_b_stride, T *dst, size_t dst_stride, size_t width, + size_t height) { + return saturating_add_sc(src_a, src_a_stride, src_b, src_b_stride, dst, + dst_stride, width, height); +} + +#define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t saturating_add( \ + const type *src_a, size_t src_a_stride, const type *src_b, \ + size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \ + size_t height) + +KLEIDICV_INSTANTIATE_TEMPLATE(int8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint32_t); +KLEIDICV_INSTANTIATE_TEMPLATE(int64_t); +KLEIDICV_INSTANTIATE_TEMPLATE(uint64_t); + +} // namespace kleidicv::sme2 + +#endif // KLEIDICV_EXPERIMENTAL_FEATURE_ADD_SME2 diff --git a/kleidicv/src/arithmetics/add_sve2.cpp b/kleidicv/src/arithmetics/add_sve2.cpp index 670237eb65cc31a08faceec302ae810d91af5837..ebbc147b5054d516cd676e693866860c2a0db852 100644 --- a/kleidicv/src/arithmetics/add_sve2.cpp +++ b/kleidicv/src/arithmetics/add_sve2.cpp @@ -1,43 +1,17 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#include - -#include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" +#include "add_sc.h" namespace kleidicv::sve2 { -template -class SaturatingAdd final : public UnrollTwice { - public: - using ContextType = Context; - using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - using VectorType = typename VecTraits::VectorType; - - VectorType vector_path(ContextType ctx, VectorType src_a, VectorType src_b) { - return svqadd_m(ctx.predicate(), src_a, src_b); - } -}; // end of class SaturatingAdd - template -kleidicv_error_t saturating_add(const T *src_a, size_t src_a_stride, - const T *src_b, size_t src_b_stride, T *dst, - size_t dst_stride, size_t width, - size_t height) { - CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); - CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); - CHECK_IMAGE_SIZE(width, height); - - SaturatingAdd operation; - Rectangle rect{width, height}; - Rows src_a_rows{src_a, src_a_stride}; - Rows src_b_rows{src_b, src_b_stride}; - Rows dst_rows{dst, dst_stride}; - apply_operation_by_rows(operation, rect, src_a_rows, src_b_rows, dst_rows); - return KLEIDICV_OK; +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t saturating_add( + const T *src_a, size_t src_a_stride, const T *src_b, size_t src_b_stride, + T *dst, size_t dst_stride, size_t width, size_t height) { + return saturating_add_sc(src_a, src_a_stride, src_b, src_b_stride, dst, + dst_stride, width, height); } #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ diff --git a/kleidicv/src/arithmetics/compare_sc.h b/kleidicv/src/arithmetics/compare_sc.h index 96f8ecaee41369cdeed273f896164189f5f3bbc9..b799b8838a714e3b25bdae05908e9bfa2bcd4714 100644 --- a/kleidicv/src/arithmetics/compare_sc.h +++ b/kleidicv/src/arithmetics/compare_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -23,7 +23,7 @@ class ComparatorEqual : public UnrollTwice { // NOLINTBEGIN(readability-make-member-function-const) VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { svbool_t pg = ctx.predicate(); svbool_t predicate = svcmpeq(pg, src_a, src_b); return svsel(predicate, VecTraits::svdup(255), VecTraits::svdup(0)); @@ -44,7 +44,7 @@ class ComparatorGreater : public UnrollTwice { // NOLINTBEGIN(readability-make-member-function-const) VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { svbool_t pg = ctx.predicate(); svbool_t predicate = svcmpgt(pg, src_a, src_b); return svsel(predicate, VecTraits::svdup(255), VecTraits::svdup(0)); @@ -56,7 +56,7 @@ template kleidicv_error_t compare_sc(const ScalarType *src_a, size_t src_a_stride, const ScalarType *src_b, size_t src_b_stride, ScalarType *dst, size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); diff --git a/kleidicv/src/arithmetics/exp_sc.h b/kleidicv/src/arithmetics/exp_sc.h index 41f9f813ec0048df8b480f3488f82cd8b4db068f..e7174da68cc353eeae8ef625a8b71ae10ec93d2a 100644 --- a/kleidicv/src/arithmetics/exp_sc.h +++ b/kleidicv/src/arithmetics/exp_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -20,8 +20,7 @@ class Exp final : public UnrollOnce { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; - VectorType vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { svfloat32_t n, r, poly, z; svuint32_t e; @@ -56,7 +55,7 @@ class Exp final : public UnrollOnce { private: static svfloat32_t specialcase(svbool_t pg, svfloat32_t poly, svfloat32_t n, - svuint32_t e) KLEIDICV_STREAMING_COMPATIBLE { + svuint32_t e) KLEIDICV_STREAMING { /* 2^n may overflow, break it up into s1*s2. */ svuint32_t b = svsel(svcmple(pg, n, svdup_f32(0.0F)), svdup_u32(0x83000000U), svdup_u32(0.0F)); @@ -79,7 +78,7 @@ using ExpTryShortPath = Exp; template static kleidicv_error_t exp_sc(const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/arithmetics/in_range_sc.h b/kleidicv/src/arithmetics/in_range_sc.h index 9b6710e0802446bfacb0c239b7622a2310892508..f7d0c9616ae392546e884f0340f882c6fff8cdf9 100644 --- a/kleidicv/src/arithmetics/in_range_sc.h +++ b/kleidicv/src/arithmetics/in_range_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -25,13 +25,12 @@ class InRange : public UnrollTwice { using SignedVectorType = typename SignedVecTraits::VectorType; InRange(VectorType &vec_lower_bound, - VectorType &vec_upper_bound) KLEIDICV_STREAMING_COMPATIBLE + VectorType &vec_upper_bound) KLEIDICV_STREAMING : vec_lower_bound_(vec_lower_bound), vec_upper_bound_(vec_upper_bound) {} // NOLINTBEGIN(readability-make-member-function-const) - VectorType vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { svbool_t pg = ctx.predicate(); VectorType diff_low = svsub_x(pg, src, vec_lower_bound_); @@ -69,14 +68,14 @@ class InRange { using DstVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using DstVectorType = typename DstVecTraits::VectorType; - InRange(float lower_bound, float upper_bound) KLEIDICV_STREAMING_COMPATIBLE + InRange(float lower_bound, float upper_bound) KLEIDICV_STREAMING : lower_bound_(lower_bound), upper_bound_(upper_bound) {} void process_row(size_t width, Columns src, - Columns dst) KLEIDICV_STREAMING_COMPATIBLE { + Columns dst) KLEIDICV_STREAMING { LoopUnroll{width, SrcVecTraits::num_lanes()} - .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING { svbool_t pg_src = SrcVecTraits::svptrue(); SrcVectorType src_v0 = svld1(pg_src, &src[0]); SrcVectorType src_v1 = svld1_vnum(pg_src, &src[0], 1); @@ -89,7 +88,7 @@ class InRange { src += ptrdiff_t(step); dst += ptrdiff_t(step); }) - .remaining([&](size_t length, size_t) KLEIDICV_STREAMING_COMPATIBLE { + .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { size_t index = 0; svbool_t pg = SrcVecTraits::svwhilelt(index, length); while (svptest_first(SrcVecTraits::svptrue(), pg)) { @@ -108,7 +107,7 @@ class InRange { // NOLINTBEGIN(readability-make-member-function-const) DstVectorType vector_path(svbool_t full_pg, SrcVectorType fsrc0, SrcVectorType fsrc1, SrcVectorType fsrc2, - SrcVectorType fsrc3) KLEIDICV_STREAMING_COMPATIBLE { + SrcVectorType fsrc3) KLEIDICV_STREAMING { svbool_t pred0 = svand_z(full_pg, svcmpge(full_pg, fsrc0, lower_bound_), svcmple(full_pg, fsrc0, upper_bound_)); auto res00 = svsel(pred0, svdup_u32(0xFF), svdup_u32(0)); @@ -134,8 +133,8 @@ class InRange { // NOLINTEND(readability-make-member-function-const) // NOLINTBEGIN(readability-make-member-function-const) - DstVectorType remaining_path(svbool_t &pg, SrcVectorType src) - KLEIDICV_STREAMING_COMPATIBLE { + DstVectorType remaining_path(svbool_t &pg, + SrcVectorType src) KLEIDICV_STREAMING { svbool_t predicate = svand_z(pg, svcmpge(pg, src, lower_bound_), svcmple(pg, src, upper_bound_)); return svsel(predicate, DstVecTraits::svdup(0xFF), DstVecTraits::svdup(0)); @@ -149,8 +148,7 @@ class InRange { template kleidicv_error_t in_range_sc(const T *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, - T lower_bound, - T upper_bound) KLEIDICV_STREAMING_COMPATIBLE { + T lower_bound, T upper_bound) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/arithmetics/scale_sc.h b/kleidicv/src/arithmetics/scale_sc.h index 269c15b4c6a18f35628a42fa753845f156fa16ef..3c74e37a4e1a60d00cf96c41ffa3e492af62d350 100644 --- a/kleidicv/src/arithmetics/scale_sc.h +++ b/kleidicv/src/arithmetics/scale_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -16,12 +16,11 @@ class AddFloat final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; - explicit AddFloat(const svfloat32_t &svshift) KLEIDICV_STREAMING_COMPATIBLE + explicit AddFloat(const svfloat32_t &svshift) KLEIDICV_STREAMING : svshift_{svshift} {} // NOLINTBEGIN(readability-make-member-function-const) - VectorType vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { return svadd_x(ctx.predicate(), src, svshift_); } // NOLINTEND(readability-make-member-function-const) @@ -37,13 +36,12 @@ class ScaleFloat final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; ScaleFloat(const svfloat32_t &svscale, - const svfloat32_t &svshift) KLEIDICV_STREAMING_COMPATIBLE + const svfloat32_t &svshift) KLEIDICV_STREAMING : svscale_{svscale}, svshift_{svshift} {} // NOLINTBEGIN(readability-make-member-function-const) - VectorType vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { return svmla_x(ctx.predicate(), svshift_, src, svscale_); } // NOLINTEND(readability-make-member-function-const) @@ -55,15 +53,13 @@ class ScaleFloat final : public UnrollTwice { template kleidicv_error_t scale_sc(const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, size_t height, - float scale, - float shift) KLEIDICV_STREAMING_COMPATIBLE; + float scale, float shift) KLEIDICV_STREAMING; // Specialization for float template <> kleidicv_error_t scale_sc(const float *src, size_t src_stride, float *dst, size_t dst_stride, size_t width, size_t height, - float scale, - float shift) KLEIDICV_STREAMING_COMPATIBLE { + float scale, float shift) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/arithmetics/sub_sme.cpp b/kleidicv/src/arithmetics/sub_sme.cpp index 0bd22f3e9224aebc64fabeacea1c88530199c513..f4756aa7df1054818a428b8a17fc93f3a579f5f1 100644 --- a/kleidicv/src/arithmetics/sub_sme.cpp +++ b/kleidicv/src/arithmetics/sub_sme.cpp @@ -17,7 +17,7 @@ class SaturatingSub final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { return svqsub_m(ctx.predicate(), src_a, src_b); } }; // end of class SaturatingSub diff --git a/kleidicv/src/arithmetics/sum_sc.h b/kleidicv/src/arithmetics/sum_sc.h index 71c5d831ee9aa75f56fd03cf7d88e399a70e86cd..6efe118ef33a629c56df6a6cf504a41625d5902f 100644 --- a/kleidicv/src/arithmetics/sum_sc.h +++ b/kleidicv/src/arithmetics/sum_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -26,13 +26,12 @@ class Sum final : public UnrollTwice { KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorTypeInternal = typename VecTraitsInternal::VectorType; - explicit Sum(VectorTypeInternal &accumulator) KLEIDICV_STREAMING_COMPATIBLE + explicit Sum(VectorTypeInternal &accumulator) KLEIDICV_STREAMING : accumulator_{accumulator} { accumulator_ = VecTraitsInternal::svdup(0); } - void vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + void vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { VectorTypeInternal src_widened_evens = svcvt_f64_f32_x(VecTraits::svptrue(), src); VectorTypeInternal src_widened_odds = @@ -42,7 +41,7 @@ class Sum final : public UnrollTwice { svadd_m(ctx.predicate(), src_widened_evens, src_widened_odds)); } - ScalarType get_sum() const KLEIDICV_STREAMING_COMPATIBLE { + ScalarType get_sum() const KLEIDICV_STREAMING { ScalarTypeInternal accumulator_final[VecTraitsInternal::max_num_lanes()] = { 0}; svst1(VecTraitsInternal::svptrue(), accumulator_final, accumulator_); @@ -60,7 +59,7 @@ class Sum final : public UnrollTwice { template kleidicv_error_t sum_sc(const T *src, size_t src_stride, size_t width, - size_t height, T *sum) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, T *sum) KLEIDICV_STREAMING { using VecTraitsInternal = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorTypeInternal = typename VecTraitsInternal::VectorType; diff --git a/kleidicv/src/arithmetics/threshold_sc.h b/kleidicv/src/arithmetics/threshold_sc.h index 2868b9b2751c69b4d5509d660910cc2d2b0adf19..0b1c8f93018887fdea166f9cbc14eb7766cdef62 100644 --- a/kleidicv/src/arithmetics/threshold_sc.h +++ b/kleidicv/src/arithmetics/threshold_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -17,13 +17,11 @@ class BinaryThreshold final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; - BinaryThreshold(ScalarType threshold, - ScalarType value) KLEIDICV_STREAMING_COMPATIBLE + BinaryThreshold(ScalarType threshold, ScalarType value) KLEIDICV_STREAMING : threshold_(threshold), value_(value) {} - VectorType vector_path(ContextType ctx, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType ctx, VectorType src) KLEIDICV_STREAMING { svbool_t predicate = svcmpgt(ctx.predicate(), src, threshold_); return svsel_u8(predicate, svdup_u8(value_), svdup_u8(0)); } @@ -37,7 +35,7 @@ template kleidicv_error_t threshold_binary_sc(const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, size_t height, T threshold, - T value) KLEIDICV_STREAMING_COMPATIBLE { + T value) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/float_conv_sc.h b/kleidicv/src/conversions/float_conv_sc.h index 35a429a831455f190ec77bb10062fdc4c666e84f..6c47397376fd38c9f3c2c5ffee5a153959533851 100644 --- a/kleidicv/src/conversions/float_conv_sc.h +++ b/kleidicv/src/conversions/float_conv_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -27,8 +27,8 @@ class float_conversion_operation { using DstVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using DstVectorType = typename DstVecTraits::VectorType; - explicit float_conversion_operation(svuint8_t& index) - KLEIDICV_STREAMING_COMPATIBLE : index_(index) { + explicit float_conversion_operation(svuint8_t& index) KLEIDICV_STREAMING + : index_(index) { // Index generation to reorder converted values by tbl instruction auto index0 = svindex_u8(0, 4); auto index1 = svindex_u8(1, 4); @@ -44,9 +44,9 @@ class float_conversion_operation { } void process_row(size_t width, Columns src, - Columns dst) KLEIDICV_STREAMING_COMPATIBLE { + Columns dst) KLEIDICV_STREAMING { LoopUnroll{width, SrcVecTraits::num_lanes()} - .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING { svbool_t pg = DstVecTraits::svptrue(); SrcVectorType src_v0 = svld1(pg, &src[0]); SrcVectorType src_v1 = svld1_vnum(pg, &src[0], 1); @@ -57,7 +57,7 @@ class float_conversion_operation { src += ptrdiff_t(step); dst += ptrdiff_t(step); }) - .remaining([&](size_t length, size_t) KLEIDICV_STREAMING_COMPATIBLE { + .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { size_t index = 0; svbool_t pg = SrcVecTraits::svwhilelt(index, length); while (svptest_first(SrcVecTraits::svptrue(), pg)) { @@ -77,7 +77,7 @@ class float_conversion_operation { typename O, std::enable_if_t && std::is_signed_v, int> = 0> decltype(auto) convert(svbool_t full_pg, - SrcVectorType in) KLEIDICV_STREAMING_COMPATIBLE { + SrcVectorType in) KLEIDICV_STREAMING { return svcvt_s32_f32_x(full_pg, in); } @@ -85,13 +85,13 @@ class float_conversion_operation { typename O, std::enable_if_t && !std::is_signed_v, int> = 0> decltype(auto) convert(svbool_t full_pg, - SrcVectorType in) KLEIDICV_STREAMING_COMPATIBLE { + SrcVectorType in) KLEIDICV_STREAMING { return svcvt_u32_f32_x(full_pg, in); } DstVectorType vector_path(svbool_t full_pg, SrcVectorType fsrc0, SrcVectorType fsrc1, SrcVectorType fsrc2, - SrcVectorType fsrc3) KLEIDICV_STREAMING_COMPATIBLE { + SrcVectorType fsrc3) KLEIDICV_STREAMING { fsrc0 = svrinti_f32_x(full_pg, fsrc0); fsrc1 = svrinti_f32_x(full_pg, fsrc1); fsrc2 = svrinti_f32_x(full_pg, fsrc2); @@ -117,8 +117,8 @@ class float_conversion_operation { template < typename O, std::enable_if_t && std::is_signed_v, int> = 0> - IntermediateVectorType remaining_path(svbool_t& pg, SrcVectorType src) - KLEIDICV_STREAMING_COMPATIBLE { + IntermediateVectorType remaining_path(svbool_t& pg, + SrcVectorType src) KLEIDICV_STREAMING { constexpr float min_val = std::numeric_limits::lowest(); constexpr float max_val = std::numeric_limits::max(); @@ -136,8 +136,8 @@ class float_conversion_operation { template < typename O, std::enable_if_t && !std::is_signed_v, int> = 0> - IntermediateVectorType remaining_path(svbool_t& pg, SrcVectorType src) - KLEIDICV_STREAMING_COMPATIBLE { + IntermediateVectorType remaining_path(svbool_t& pg, + SrcVectorType src) KLEIDICV_STREAMING { constexpr float max_val = std::numeric_limits::max(); src = svrinti_f32_x(pg, src); @@ -160,9 +160,9 @@ class float_conversion_operation { explicit float_conversion_operation(svuint8_t&) {} void process_row(size_t width, Columns src, - Columns dst) KLEIDICV_STREAMING_COMPATIBLE { + Columns dst) KLEIDICV_STREAMING { LoopUnroll{width, VecTraits::num_lanes()} - .unroll_twice([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_twice([&](size_t step) KLEIDICV_STREAMING { svbool_t pg = VecTraits::svptrue(); auto src_vect1 = load_src(pg, &src[0], 0); auto src_vect2 = load_src(pg, &src[0], 1); @@ -174,7 +174,7 @@ class float_conversion_operation { src += ptrdiff_t(step); dst += ptrdiff_t(step); }) - .remaining([&](size_t length, size_t) KLEIDICV_STREAMING_COMPATIBLE { + .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { size_t index = 0; svbool_t pg = VecTraits::svwhilelt(index, length); while (svptest_first(VecTraits::svptrue(), pg)) { @@ -190,14 +190,12 @@ class float_conversion_operation { private: template , int> = 0> - VectorType vector_path(svbool_t& pg, - I src_vector) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { return svcvt_f32_s32_x(pg, src_vector); } template , int> = 0> - VectorType vector_path(svbool_t& pg, - I src_vector) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { return svcvt_f32_u32_x(pg, src_vector); } @@ -205,7 +203,7 @@ class float_conversion_operation { typename I, std::enable_if_t && std::is_signed_v, int> = 0> svint32_t load_src(svbool_t& pg, const I* src, - size_t vnum) KLEIDICV_STREAMING_COMPATIBLE { + size_t vnum) KLEIDICV_STREAMING { svint32_t src_vect = svld1sb_vnum_s32(pg, src, vnum); return src_vect; } @@ -214,16 +212,17 @@ class float_conversion_operation { typename I, std::enable_if_t && !std::is_signed_v, int> = 0> svuint32_t load_src(svbool_t& pg, const I* src, - size_t vnum) KLEIDICV_STREAMING_COMPATIBLE { + size_t vnum) KLEIDICV_STREAMING { svuint32_t src_vect = svld1ub_vnum_u32(pg, src, vnum); return src_vect; } }; // end of class float_conversion_operation template -static kleidicv_error_t float_conversion_sc( - const InputType* src, size_t src_stride, OutputType* dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t float_conversion_sc(const InputType* src, + size_t src_stride, OutputType* dst, + size_t dst_stride, size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/gray_to_rgb_sc.h b/kleidicv/src/conversions/gray_to_rgb_sc.h index ca635694caec2b15cdace0dab4c41963f5d12c66..fed681ab09a6d91d68e04997fda10c469c5383eb 100644 --- a/kleidicv/src/conversions/gray_to_rgb_sc.h +++ b/kleidicv/src/conversions/gray_to_rgb_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -24,26 +24,26 @@ class GrayToRGB final : #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE void vector_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x3_t dst_vect = svcreate3(src_vect, src_vect, src_vect); svst3(pg, dst, dst_vect); } #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE - explicit GrayToRGB(svuint8x3_t &indices) KLEIDICV_STREAMING_COMPATIBLE + explicit GrayToRGB(svuint8x3_t &indices) KLEIDICV_STREAMING : indices_{indices} { initialize_indices(); } void vector_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { // Call the common vector path. auto pg = ctx.predicate(); common_vector_path(pg, pg, pg, src_vect, dst); } void tail_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); // Predicates for consecutive stores. svbool_t pg_0, pg_1, pg_2; @@ -55,7 +55,7 @@ class GrayToRGB final : private: void common_vector_path(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { // Convert from gray to RGB using table-lookups. VectorType dst_vec_0 = svtbl(src_vect, svget3(indices_, 0)); VectorType dst_vec_1 = svtbl(src_vect, svget3(indices_, 1)); @@ -66,7 +66,7 @@ class GrayToRGB final : svst1_vnum(pg_2, &dst[0], 2, dst_vec_2); } - void initialize_indices() KLEIDICV_STREAMING_COMPATIBLE { + void initialize_indices() KLEIDICV_STREAMING { // All-true predicate to shorten code. svbool_t pg_all = VecTraits::svptrue(); // Constant used for division by 3. @@ -109,7 +109,7 @@ class GrayToRGBAWithInterleaving final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; void vector_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8_t alpha = svdup_u8(0xff); svuint8x4_t dst_vect = svcreate4(src_vect, src_vect, src_vect, alpha); @@ -126,20 +126,20 @@ class GrayToRGBAWithLookUpTable final : public UnrollTwice, using ContextType = Context; using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; - explicit GrayToRGBAWithLookUpTable(svuint8x4_t &indices) - KLEIDICV_STREAMING_COMPATIBLE : indices_{indices} { + explicit GrayToRGBAWithLookUpTable(svuint8x4_t &indices) KLEIDICV_STREAMING + : indices_{indices} { initialize_indices(); } void vector_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { // Call the common vector path. auto pg = ctx.predicate(); common_vector_path(pg, pg, pg, pg, src_vect, dst); } void tail_path(ContextType ctx, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); // Predicates for consecutive stores. svbool_t pg_0, pg_1, pg_2, pg_3; @@ -151,7 +151,7 @@ class GrayToRGBAWithLookUpTable final : public UnrollTwice, private: void common_vector_path(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, svbool_t pg_3, VectorType src_vect, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { svuint8x2_t src_and_alpha = svcreate2(src_vect, VecTraits::svdup(-1)); // Convert from gray to RGBA using table-lookups. @@ -166,7 +166,7 @@ class GrayToRGBAWithLookUpTable final : public UnrollTwice, svst1_vnum(pg_3, &dst[0], 3, dst_vec_3); } - void initialize_indices() KLEIDICV_STREAMING_COMPATIBLE { + void initialize_indices() KLEIDICV_STREAMING { // Number of four-tuple elements. uint64_t num_four_tuples = VecTraits::num_lanes() / 4; // Index of alpha. @@ -203,7 +203,7 @@ class GrayToRGBAWithLookUpTable final : public UnrollTwice, KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t gray_to_rgb_u8_sc( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -223,7 +223,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t gray_to_rgb_u8_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t gray_to_rgba_u8_sc( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/rgb_to_rgb_sc.h b/kleidicv/src/conversions/rgb_to_rgb_sc.h index e07c6a93c1fc401ce1e722cba150adb1ada38fdc..91392bd9d6de0f1ab7f943915b73bbe4043d9022 100644 --- a/kleidicv/src/conversions/rgb_to_rgb_sc.h +++ b/kleidicv/src/conversions/rgb_to_rgb_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -24,7 +24,7 @@ class RGBToBGR final : #if KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE || !KLEIDICV_ASSUME_128BIT_SVE2 void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x3_t src_vect = svld3(pg, src); svuint8x3_t dst_vect = svcreate3(svget3(src_vect, 2), svget3(src_vect, 1), @@ -34,20 +34,20 @@ class RGBToBGR final : } #else // KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE || // !KLEIDICV_ASSUME_128BIT_SVE2 - explicit RGBToBGR(svuint8x4_t &indices) KLEIDICV_STREAMING_COMPATIBLE + explicit RGBToBGR(svuint8x4_t &indices) KLEIDICV_STREAMING : indices_{indices} { initialize_indices(); } void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { // Call the common vector path. auto pg = ctx.predicate(); common_vector_path(pg, pg, pg, src, dst); } void tail_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); // Predicates for consecutive stores. svbool_t pg_0, pg_1, pg_2; @@ -59,7 +59,7 @@ class RGBToBGR final : private: void common_vector_path(svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { VectorType src_0 = svld1(pg_0, &src[0]); VectorType src_1 = svld1_vnum(pg_1, &src[0], 1); VectorType src_2 = svld1_vnum(pg_2, &src[0], 2); @@ -78,7 +78,7 @@ class RGBToBGR final : svst1_vnum(pg_2, &dst[0], 2, dst_vec_2); } - void initialize_indices() KLEIDICV_STREAMING_COMPATIBLE { + void initialize_indices() KLEIDICV_STREAMING { svbool_t pg = VecTraits::svptrue(); indices_ = svcreate4(svld1(pg, &kTableIndices[0]), svld1_vnum(pg, &kTableIndices[0], 1), @@ -105,7 +105,7 @@ class RGBAToBGRA final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x4_t src_vect = svld4(pg, src); svuint8x4_t dst_vect = svcreate4(svget4(src_vect, 2), svget4(src_vect, 1), @@ -122,7 +122,7 @@ class RGBToBGRA final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x3_t src_vect = svld3(pg, src); svuint8x4_t dst_vect = svcreate4(svget3(src_vect, 2), svget3(src_vect, 1), @@ -139,7 +139,7 @@ class RGBToRGBA final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x3_t src_vect = svld3(pg, src); svuint8x4_t dst_vect = svcreate4(svget3(src_vect, 0), svget3(src_vect, 1), @@ -156,7 +156,7 @@ class RGBAToBGR final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x4_t src_vect = svld4(pg, src); svuint8x3_t dst_vect = svcreate3(svget4(src_vect, 2), svget4(src_vect, 1), @@ -173,7 +173,7 @@ class RGBAToRGB final : public UnrollTwice { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x4_t src_vect = svld4(pg, src); svuint8x3_t dst_vect = svcreate3(svget4(src_vect, 0), svget4(src_vect, 1), @@ -185,7 +185,7 @@ class RGBAToRGB final : public UnrollTwice { KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t rgb_to_bgr_u8_sc( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -204,9 +204,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t rgb_to_bgr_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgba_to_bgra_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgba_to_bgra_u8_sc(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -220,9 +221,10 @@ static kleidicv_error_t rgba_to_bgra_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgb_to_bgra_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgb_to_bgra_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -236,9 +238,10 @@ static kleidicv_error_t rgb_to_bgra_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgb_to_rgba_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgb_to_rgba_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -252,9 +255,10 @@ static kleidicv_error_t rgb_to_rgba_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgba_to_bgr_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgba_to_bgr_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -268,9 +272,10 @@ static kleidicv_error_t rgba_to_bgr_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgba_to_rgb_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgba_to_rgb_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/rgb_to_yuv_sc.h b/kleidicv/src/conversions/rgb_to_yuv_sc.h index 99059a6df89feee4b8a54d1780a724d4f90d2829..c152ade8a3efb10363aff4e931734edb8430d885 100644 --- a/kleidicv/src/conversions/rgb_to_yuv_sc.h +++ b/kleidicv/src/conversions/rgb_to_yuv_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -25,7 +25,7 @@ class RGBToYUVBase : public UnrollOnce { void vector_calculation_path(svbool_t pg, svint16_t r_0, svint16_t r_1, svint16_t g_0, svint16_t g_1, svint16_t b_0, svint16_t b_1, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { // Compute Y value in 32-bit precision svint16_t y_0, y_1; { @@ -102,8 +102,8 @@ class RGBToYUVBase : public UnrollOnce { static constexpr size_t b_index_ = BGR ? 0 : 2; static constexpr uint32_t half_ = (std::numeric_limits::max() / 2 + 1U) << kWeightScale; - static svint16_t combine_scaled_s16(svint32_t even, svint32_t odd) - KLEIDICV_STREAMING_COMPATIBLE { + static svint16_t combine_scaled_s16(svint32_t even, + svint32_t odd) KLEIDICV_STREAMING { return svqrshrnt(svqrshrnb(even, kWeightScale), odd, kWeightScale); } }; // end of class RGBToYUVBase @@ -117,12 +117,10 @@ class RGBToYUV final : public RGBToYUVBase { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; // Returns the number of channels in the output image. - static constexpr size_t input_channels() KLEIDICV_STREAMING_COMPATIBLE { - return 3; - } + static constexpr size_t input_channels() KLEIDICV_STREAMING { return 3; } void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svuint8x3_t svsrc = svld3(pg, src); svint16_t r_0 = svreinterpret_s16_u16(svmovlb(svget3(svsrc, r_index_))); @@ -149,7 +147,7 @@ class RGBAToYUV final : public RGBToYUVBase, public UsesTailPath { using ScalarType = uint8_t; using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING_COMPATIBLE + explicit RGBAToYUV(svuint8x4_t &sv4) KLEIDICV_STREAMING : deinterleave16_indices_(sv4) { // clang-format off // From the unzipped RGBA -> RBRBRBRB..., take it apart to even and odd @@ -167,18 +165,16 @@ class RGBAToYUV final : public RGBToYUVBase, public UsesTailPath { } // Returns the number of channels in the output image. - static constexpr size_t input_channels() KLEIDICV_STREAMING_COMPATIBLE { - return 4; - } + static constexpr size_t input_channels() KLEIDICV_STREAMING { return 4; } void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); common_vector_path(pg, pg, pg, pg, pg, src, dst); } void tail_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); svbool_t pg_0, pg_1, pg_2, pg_3; VecTraits::make_consecutive_predicates(pg, pg_0, pg_1, pg_2, pg_3); @@ -188,7 +184,7 @@ class RGBAToYUV final : public RGBToYUVBase, public UsesTailPath { private: void common_vector_path(svbool_t pg, svbool_t pg_0, svbool_t pg_1, svbool_t pg_2, svbool_t pg_3, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { svint16_t r_0, r_1, g_0, g_1, b_0, b_1; svuint8_t src0 = svld1(pg_0, src); @@ -247,10 +243,11 @@ class RGBAToYUV final : public RGBToYUVBase, public UsesTailPath { }; // end of class RGBAToYUV template -kleidicv_error_t rgb2yuv_operation( - OperationType operation, const ScalarType *src, size_t src_stride, - ScalarType *dst, size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { +kleidicv_error_t rgb2yuv_operation(OperationType operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -264,18 +261,20 @@ kleidicv_error_t rgb2yuv_operation( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgb_to_yuv_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgb_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { RGBToYUV operation; return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, height); } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t rgba_to_yuv_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t rgba_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { svuint8x4_t indices; RGBAToYUV operation(indices); return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, @@ -283,18 +282,20 @@ static kleidicv_error_t rgba_to_yuv_u8_sc( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t bgr_to_yuv_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t bgr_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { RGBToYUV operation; return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, height); } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t bgra_to_yuv_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t bgra_to_yuv_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { svuint8x4_t indices; RGBAToYUV operation(indices); return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width, diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h index 8998e54bbe7f6ffbc086e4a473124e9917d89475..201787117cb948c00d6e74ca26a23045520f6ef2 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -17,11 +17,11 @@ class YUVSpToRGBxOrBGRx final { using ContextType = Context; using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - explicit YUVSpToRGBxOrBGRx(bool is_nv21) KLEIDICV_STREAMING_COMPATIBLE + explicit YUVSpToRGBxOrBGRx(bool is_nv21) KLEIDICV_STREAMING : is_nv21_(is_nv21) {} // Returns the number of channels in the output image. - static constexpr size_t output_channels() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr size_t output_channels() KLEIDICV_STREAMING { return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; } @@ -30,7 +30,7 @@ class YUVSpToRGBxOrBGRx final { void vector_path(ContextType ctx, const uint8_t *y_row_0, const uint8_t *y_row_1, const uint8_t *uv_row, uint8_t *rgbx_row_0, - uint8_t *rgbx_row_1) KLEIDICV_STREAMING_COMPATIBLE { + uint8_t *rgbx_row_1) KLEIDICV_STREAMING { auto pg = ctx.predicate(); // Both the rounding shift right constant and the -128 value are included. @@ -189,8 +189,7 @@ template kleidicv_error_t yuv2rgbx_operation( OperationType &operation, const ScalarType *src_y, size_t src_y_stride, const ScalarType *src_uv, size_t src_uv_stride, ScalarType *dst, - size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride, size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_y, src_y_stride, height); CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); @@ -215,7 +214,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t yuv_sp_to_rgb_u8_sc( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, - size_t height, bool is_nv21) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, bool is_nv21) KLEIDICV_STREAMING { YUVSpToRGB operation{is_nv21}; return yuv2rgbx_operation(operation, src_y, src_y_stride, src_uv, src_uv_stride, dst, dst_stride, width, height); @@ -225,7 +224,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t yuv_sp_to_rgba_u8_sc( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, - size_t height, bool is_nv21) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, bool is_nv21) KLEIDICV_STREAMING { YUVSpToRGBA operation{is_nv21}; return yuv2rgbx_operation(operation, src_y, src_y_stride, src_uv, src_uv_stride, dst, dst_stride, width, height); @@ -235,7 +234,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t yuv_sp_to_bgr_u8_sc( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, - size_t height, bool is_nv21) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, bool is_nv21) KLEIDICV_STREAMING { YUVSpToBGR operation{is_nv21}; return yuv2rgbx_operation(operation, src_y, src_y_stride, src_uv, src_uv_stride, dst, dst_stride, width, height); @@ -245,7 +244,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t yuv_sp_to_bgra_u8_sc( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, - size_t height, bool is_nv21) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, bool is_nv21) KLEIDICV_STREAMING { YUVSpToBGRA operation{is_nv21}; return yuv2rgbx_operation(operation, src_y, src_y_stride, src_uv, src_uv_stride, dst, dst_stride, width, height); diff --git a/kleidicv/src/conversions/yuv_to_rgb_sc.h b/kleidicv/src/conversions/yuv_to_rgb_sc.h index aee4bbdabde4f5dfd727d81c50047543b9da3cbb..e068840198c86624e6274043eeebfeb004cc2090 100644 --- a/kleidicv/src/conversions/yuv_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_to_rgb_sc.h @@ -25,12 +25,12 @@ class YUVToRGB : public UnrollOnce { typename std::conditional::type; // Returns the number of channels in the output image. - static constexpr size_t output_channels() KLEIDICV_STREAMING_COMPATIBLE { + static constexpr size_t output_channels() KLEIDICV_STREAMING { return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; } void vector_path(ContextType ctx, const ScalarType *src, - ScalarType *dst) KLEIDICV_STREAMING_COMPATIBLE { + ScalarType *dst) KLEIDICV_STREAMING { auto pg = ctx.predicate(); Vector3Type svsrc = svld3(pg, src); svint16_t y_0 = svreinterpret_s16_u16(svshllb_n_u16(svget3(svsrc, 0), 0)); @@ -140,10 +140,11 @@ class YUVToRGB : public UnrollOnce { }; // end of class YUVToRGB template -kleidicv_error_t yuv2rgb_operation( - OperationType operation, const ScalarType *src, size_t src_stride, - ScalarType *dst, size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { +kleidicv_error_t yuv2rgb_operation(OperationType operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -157,36 +158,40 @@ kleidicv_error_t yuv2rgb_operation( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t yuv_to_rgb_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t yuv_to_rgb_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { YUVToRGB operation; return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, height); } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t yuv_to_rgba_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t yuv_to_rgba_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { YUVToRGB operation; return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, height); } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t yuv_to_bgr_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t yuv_to_bgr_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { YUVToRGB operation; return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, height); } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t yuv_to_bgra_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height) KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t yuv_to_bgra_u8_sc(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, + size_t height) KLEIDICV_STREAMING { YUVToRGB operation; return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, height); diff --git a/kleidicv/src/filters/blur_and_downsample_sc.h b/kleidicv/src/filters/blur_and_downsample_sc.h index 2470cf1484c0bb2e34ba440705e04a1f83abf1f6..a8a99300268e3d5c8338ff8a5124ff4cc5f4fc98 100644 --- a/kleidicv/src/filters/blur_and_downsample_sc.h +++ b/kleidicv/src/filters/blur_and_downsample_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -38,25 +38,24 @@ class BlurAndDownsample { static constexpr size_t margin = 2UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets) const KLEIDICV_STREAMING { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - loop.unroll_twice([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](ptrdiff_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING { svbool_t pg_all = SourceVecTraits::svptrue(); vertical_vector_path_1x(pg_all, src_rows, dst_rows, border_offsets, index); }); - loop.remaining([&](ptrdiff_t index, - ptrdiff_t length) KLEIDICV_STREAMING_COMPATIBLE { + loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING { svbool_t pg = SourceVecTraits::svwhilelt(index, length); vertical_vector_path_1x(pg, src_rows, dst_rows, border_offsets, index); }); @@ -65,17 +64,16 @@ class BlurAndDownsample { void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { svbool_t pg_all = BufferVecTraits::svptrue(); LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; - loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { horizontal_vector_path_2x(pg_all, pg_all, src_rows, pg_all, dst_rows, border_offsets, static_cast(index)); }); - loop.remaining([&](size_t index, - size_t length) KLEIDICV_STREAMING_COMPATIBLE { + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { svbool_t pg_src_0 = BufferVecTraits::svwhilelt(index, length); svbool_t pg_src_1 = BufferVecTraits::svwhilelt( index + BufferVecTraits::num_lanes(), length); @@ -88,7 +86,7 @@ class BlurAndDownsample { void process_horizontal_borders( Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { + BorderOffsets border_offsets) const KLEIDICV_STREAMING { for (ptrdiff_t index = 0; index < static_cast(src_rows.channels()); ++index) { disable_loop_vectorization(); @@ -98,10 +96,10 @@ class BlurAndDownsample { } private: - void vertical_vector_path_2x( - svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - ptrdiff_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path_2x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + ptrdiff_t index) const KLEIDICV_STREAMING { const auto *src_row_0 = &src_rows.at(border_offsets.c0())[index]; const auto *src_row_1 = &src_rows.at(border_offsets.c1())[index]; const auto *src_row_2 = &src_rows.at(border_offsets.c2())[index]; @@ -134,10 +132,10 @@ class BlurAndDownsample { SourceVecTraits::num_lanes())]); } - void vertical_vector_path_1x( - svbool_t pg, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets, - ptrdiff_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path_1x(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + ptrdiff_t index) const KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(border_offsets.c0())[index]); SourceVectorType src_1 = @@ -157,8 +155,7 @@ class BlurAndDownsample { // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, - BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4); svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4); svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3); @@ -173,11 +170,12 @@ class BlurAndDownsample { svst2(pg, &dst[0], interleaved); } - void horizontal_vector_path_2x( - svbool_t pg_src_0, svbool_t pg_src_1, Rows src_rows, - svbool_t pg_dst, Rows dst_rows, - BorderOffsets border_offsets, - ptrdiff_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path_2x(svbool_t pg_src_0, svbool_t pg_src_1, + Rows src_rows, + svbool_t pg_dst, + Rows dst_rows, + BorderOffsets border_offsets, + ptrdiff_t index) const KLEIDICV_STREAMING { const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index]; const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index]; const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index]; @@ -207,9 +205,10 @@ class BlurAndDownsample { // Applies horizontal filtering vector using SIMD operations. // // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - svuint16_t horizontal_vector_path( - svbool_t pg, svuint16_t src_0, svuint16_t src_1, svuint16_t src_2, - svuint16_t src_3, svuint16_t src_4) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t horizontal_vector_path(svbool_t pg, svuint16_t src_0, + svuint16_t src_1, svuint16_t src_2, + svuint16_t src_3, + svuint16_t src_4) const KLEIDICV_STREAMING { svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); @@ -221,10 +220,10 @@ class BlurAndDownsample { // Applies horizontal filtering for the borders using SIMD operations. // // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void horizontal_border_path( - svbool_t pg, Rows src_rows, - Rows dst_rows, BorderOffsets border_offsets, - ptrdiff_t index) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_border_path(svbool_t pg, Rows src_rows, + Rows dst_rows, + BorderOffsets border_offsets, + ptrdiff_t index) const KLEIDICV_STREAMING { BufferVectorType src_0 = svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); BufferVectorType src_1 = @@ -251,7 +250,7 @@ class BlurAndDownsample { static kleidicv_error_t blur_and_downsample_checks( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t channels, - BlurAndDownsampleFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + BlurAndDownsampleFilterWorkspace *workspace) KLEIDICV_STREAMING { CHECK_POINTERS(workspace); CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2); @@ -277,7 +276,7 @@ static kleidicv_error_t blur_and_downsample_stripe_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end, size_t channels, FixedBorderType fixed_border_type, - kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_filter_context_t *context) KLEIDICV_STREAMING { // Does not include checks for whether the operation is implemented. // This must be done earlier, by blur_and_downsample_is_implemented. auto *workspace = diff --git a/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp b/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp index a0813395fb011119cae334f52d090b92927381b4..31a6d5b8f8f67f5ea8e2ee64f43d0824613ad7a5 100644 --- a/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp @@ -76,7 +76,7 @@ class GaussianBlurArbitrary { void process_arbitrary_horizontal( size_t width, size_t kernel_size, Rows buffer_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { size_t x = 0; // Assume that there is always a widening when calculating, so the // horizontal vector path processes double-width vectors diff --git a/kleidicv/src/filters/gaussian_blur_fixed_sc.h b/kleidicv/src/filters/gaussian_blur_fixed_sc.h index cec5a9fe0b32252cbd428b485133dac927690ae5..2971058d7bf32db57a36c507814ca4412c926c61 100644 --- a/kleidicv/src/filters/gaussian_blur_fixed_sc.h +++ b/kleidicv/src/filters/gaussian_blur_fixed_sc.h @@ -42,9 +42,9 @@ class GaussianBlur { // Applies vertical filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[3], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]); svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]); @@ -61,9 +61,9 @@ class GaussianBlur { // Applies horizontal filtering vector using SIMD operations. // // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[3], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[3], + DestinationType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]); svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]); @@ -75,8 +75,8 @@ class GaussianBlur { // Applies horizontal filtering vector using scalar operations. // // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[3], + DestinationType *dst) const KLEIDICV_STREAMING { auto acc = src[0] + 2 * src[1] + src[2]; dst[0] = rounding_shift_right(acc, 4); } @@ -99,9 +99,9 @@ class GaussianBlur { // Applies vertical filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[5], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[5], + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]); svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]); svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]); @@ -119,9 +119,9 @@ class GaussianBlur { // Applies horizontal filtering vector using SIMD operations. // // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[5], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[5], + DestinationType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]); svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]); svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6); @@ -133,8 +133,8 @@ class GaussianBlur { // Applies horizontal filtering vector using scalar operations. // // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[5], + DestinationType *dst) const KLEIDICV_STREAMING { auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2]; dst[0] = rounding_shift_right(acc, 8); } @@ -168,9 +168,9 @@ class GaussianBlur { // // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[7], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[7], + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]); svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]); @@ -205,9 +205,9 @@ class GaussianBlur { // // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[7], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[7], + DestinationType *dst) const KLEIDICV_STREAMING { svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]); svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]); @@ -242,8 +242,8 @@ class GaussianBlur { // // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T - void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[7], + DestinationType *dst) const KLEIDICV_STREAMING { uint32_t acc = src[0] * 2 + src[1] * 7 + src[2] * 14 + src[3] * 18 + src[4] * 14 + src[5] * 7 + src[6] * 2; dst[0] = rounding_shift_right(acc, 12); @@ -268,12 +268,12 @@ class GaussianBlur { void vertical_vector_path( svbool_t pg, std::reference_wrapper src[KernelSize], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType *dst) const KLEIDICV_STREAMING { common_vector_path(pg, src, dst); } - void vertical_scalar_path(const SourceType src[KernelSize], BufferType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_scalar_path(const SourceType src[KernelSize], + BufferType *dst) const KLEIDICV_STREAMING { uint32_t acc = static_cast(src[kHalfKernelSize - 1]) * half_kernel_[kHalfKernelSize - 1]; @@ -290,20 +290,19 @@ class GaussianBlur { void horizontal_vector_path( svbool_t pg, std::reference_wrapper src[KernelSize], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType *dst) const KLEIDICV_STREAMING { common_vector_path(pg, src, dst); } void horizontal_scalar_path(const BufferType src[KernelSize], - DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { + DestinationType *dst) const KLEIDICV_STREAMING { vertical_scalar_path(src, dst); } private: void common_vector_path( svbool_t pg, std::reference_wrapper src[KernelSize], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + BufferType *dst) const KLEIDICV_STREAMING { svbool_t pg16_all = svptrue_b16(); svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1], half_kernel_[kHalfKernelSize - 1]); @@ -338,7 +337,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( const ScalarType *src, size_t src_stride, ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end, size_t channels, float sigma, FixedBorderType border_type, - SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { using GaussianBlurFilter = GaussianBlur; Rows src_rows{src, src_stride, channels}; @@ -385,7 +384,7 @@ static kleidicv_error_t gaussian_blur( size_t kernel_size, const ScalarType *src, size_t src_stride, ScalarType *dst, size_t dst_stride, Rectangle &rect, size_t y_begin, size_t y_end, size_t channels, float sigma, FixedBorderType border_type, - SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { switch (kernel_size) { case 3: return gaussian_blur_fixed_kernel_size<3, IsBinomial>( @@ -423,7 +422,7 @@ static kleidicv_error_t gaussian_blur_fixed_stripe_u8_sc( size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, size_t /*kernel_height*/, float sigma_x, float /*sigma_y*/, FixedBorderType fixed_border_type, - kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_filter_context_t *context) KLEIDICV_STREAMING { auto *workspace = reinterpret_cast(context); kleidicv_error_t checks_result = gaussian_blur_checks( src, src_stride, dst, dst_stride, width, height, channels, workspace); diff --git a/kleidicv/src/filters/median_blur_sorting_network_3x3.h b/kleidicv/src/filters/median_blur_sorting_network_3x3.h index d7381cd9fa773a19ef284a0d07dfcf561af64467..82fa234e2745fc9efe6a8f41faf084736da2f510 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_3x3.h +++ b/kleidicv/src/filters/median_blur_sorting_network_3x3.h @@ -14,8 +14,8 @@ namespace KLEIDICV_TARGET_NAMESPACE { template void sorting_network3x3_single_row(KernelWindowFunctor& KernelWindow, - T& output_vec, ContextType& context) - KLEIDICV_STREAMING_COMPATIBLE { + T& output_vec, + ContextType& context) KLEIDICV_STREAMING { // full sort row Comparator::compare_and_swap(KernelWindow(0, 0), KernelWindow(0, 2), context); Comparator::compare_and_swap(KernelWindow(0, 0), KernelWindow(0, 1), context); @@ -50,9 +50,9 @@ void sorting_network3x3_single_row(KernelWindowFunctor& KernelWindow, template -void sorting_network3x3_dual_rows( - KernelWindowFunctor& KernelWindow, T& output_vec_0, T& output_vec_1, - ContextType& context) KLEIDICV_STREAMING_COMPATIBLE { +void sorting_network3x3_dual_rows(KernelWindowFunctor& KernelWindow, + T& output_vec_0, T& output_vec_1, + ContextType& context) KLEIDICV_STREAMING { // full sort row Comparator::compare_and_swap(KernelWindow(0, 0), KernelWindow(0, 2), context); Comparator::compare_and_swap(KernelWindow(0, 0), KernelWindow(0, 1), context); diff --git a/kleidicv/src/filters/median_blur_sorting_network_5x5.h b/kleidicv/src/filters/median_blur_sorting_network_5x5.h index 9581b931a120549eb4198e450f6e796e29c44315..b4781f61a2a850bff1de7f5ae2e9d487f2a90447 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_5x5.h +++ b/kleidicv/src/filters/median_blur_sorting_network_5x5.h @@ -19,7 +19,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { template void sorting_network5x5(KernelWindowFunctor& KernelWindow, T& output_vec, - ContextType& context) KLEIDICV_STREAMING_COMPATIBLE { + ContextType& context) KLEIDICV_STREAMING { Comparator::compare_and_swap(KernelWindow(3, 0), KernelWindow(0, 0), context); Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(1, 0), context); Comparator::compare_and_swap(KernelWindow(2, 0), KernelWindow(0, 0), context); diff --git a/kleidicv/src/filters/median_blur_sorting_network_7x7.h b/kleidicv/src/filters/median_blur_sorting_network_7x7.h index 863363ba5b919adc3c89e94bf0263b3d89374266..27aacdbd693f6997efc1ac233a5bffae5e4ecc69 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_7x7.h +++ b/kleidicv/src/filters/median_blur_sorting_network_7x7.h @@ -16,7 +16,7 @@ namespace KLEIDICV_TARGET_NAMESPACE { template void sorting_network7x7(KernelWindowFunctor& KernelWindow, T& output_vec, - ContextType& context) KLEIDICV_STREAMING_COMPATIBLE { + ContextType& context) KLEIDICV_STREAMING { Comparator::compare_and_swap(KernelWindow(0, 0), KernelWindow(6, 0), context); Comparator::compare_and_swap(KernelWindow(2, 0), KernelWindow(3, 0), context); Comparator::compare_and_swap(KernelWindow(4, 0), KernelWindow(5, 0), context); diff --git a/kleidicv/src/filters/median_blur_sorting_network_sc.h b/kleidicv/src/filters/median_blur_sorting_network_sc.h index f8496b3e8df1bccc92be3a9965036120844bcae6..3291fb3e333bca4f3b99d6cb848eadcfc2d915c1 100644 --- a/kleidicv/src/filters/median_blur_sorting_network_sc.h +++ b/kleidicv/src/filters/median_blur_sorting_network_sc.h @@ -31,7 +31,7 @@ class VectorComparator { using SourceVectorType = typename VecTraits::VectorType; static void compare_and_swap(SourceVectorType& left, SourceVectorType& right, - svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t& pg) KLEIDICV_STREAMING { SourceVectorType max_value = svmax_m(pg, left, right); SourceVectorType min_value = svmin_m(pg, left, right); left = min_value; @@ -39,22 +39,22 @@ class VectorComparator { } static void min(SourceVectorType& left, SourceVectorType& right, - svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t& pg) KLEIDICV_STREAMING { left = svmin_m(pg, left, right); } static void max(SourceVectorType& left, SourceVectorType& right, - svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t& pg) KLEIDICV_STREAMING { right = svmax_m(pg, left, right); } static SourceVectorType get_min(SourceVectorType& left, SourceVectorType& right, - svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t& pg) KLEIDICV_STREAMING { return svmin_m(pg, left, right); } static SourceVectorType get_max(SourceVectorType& left, SourceVectorType& right, - svbool_t& pg) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t& pg) KLEIDICV_STREAMING { return svmax_m(pg, left, right); } }; @@ -70,8 +70,7 @@ class MedianBlurSortingNetwork { template void vector_path(svbool_t& pg, KernelWindowFunctor& KernelWindow, - DestinationVectorType& output_vec) const - KLEIDICV_STREAMING_COMPATIBLE { + DestinationVectorType& output_vec) const KLEIDICV_STREAMING { sorting_network3x3_single_row>(KernelWindow, output_vec, pg); } @@ -80,7 +79,7 @@ class MedianBlurSortingNetwork { void vector_path_for_dual_row_handling( svbool_t& pg, KernelWindowFunctor& KernelWindow, DestinationVectorType& output_vec_0, - DestinationVectorType& output_vec_1) const KLEIDICV_STREAMING_COMPATIBLE { + DestinationVectorType& output_vec_1) const KLEIDICV_STREAMING { sorting_network3x3_dual_rows>( KernelWindow, output_vec_0, output_vec_1, pg); } @@ -99,8 +98,7 @@ class MedianBlurSortingNetwork { DestinationType>::VectorType; template void vector_path(svbool_t& pg, KernelWindowFunctor& KernelWindow, - DestinationVectorType& output_vec) const - KLEIDICV_STREAMING_COMPATIBLE { + DestinationVectorType& output_vec) const KLEIDICV_STREAMING { sorting_network5x5>(KernelWindow, output_vec, pg); } @@ -120,8 +118,7 @@ class MedianBlurSortingNetwork { template void vector_path(svbool_t& pg, KernelWindowFunctor& KernelWindow, - DestinationVectorType& output_vec) const - KLEIDICV_STREAMING_COMPATIBLE { + DestinationVectorType& output_vec) const KLEIDICV_STREAMING { sorting_network7x7>(KernelWindow, output_vec, pg); } @@ -132,7 +129,7 @@ kleidicv_error_t median_blur_sorting_network_stripe_sc( const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, size_t kernel_width, [[maybe_unused]] size_t kernel_height, - FixedBorderType border_type) KLEIDICV_STREAMING_COMPATIBLE { + FixedBorderType border_type) KLEIDICV_STREAMING { Rectangle rect{width, height}; Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; diff --git a/kleidicv/src/filters/scharr_sc.h b/kleidicv/src/filters/scharr_sc.h index 2fdeaef9180aacce7e0ff230a694a3de713244db..6f2ba655d96ac0a4cae1854e9a73a741a3d97d46 100644 --- a/kleidicv/src/filters/scharr_sc.h +++ b/kleidicv/src/filters/scharr_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -44,13 +44,13 @@ class ScharrInterleaved { public: ScharrInterleaved(Rows hori_deriv_buffer, Rows vert_deriv_buffer, - size_t width) KLEIDICV_STREAMING_COMPATIBLE + size_t width) KLEIDICV_STREAMING : hori_deriv_buffer_(hori_deriv_buffer), vert_deriv_buffer_(vert_deriv_buffer), width_(width) {} void process(Rows src_rows, Rows dst_rows, - size_t y_begin, size_t y_end) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end) KLEIDICV_STREAMING { for (size_t i = y_begin; i < y_end; ++i) { process_vertical(src_rows.at(static_cast(i))); process_horizontal(dst_rows.at(static_cast(i))); @@ -59,7 +59,7 @@ class ScharrInterleaved { private: void vertical_vector_path(svbool_t pg, Rows src_rows, - ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t index) KLEIDICV_STREAMING { SourceVectorType src_0 = svld1(pg, &src_rows.at(0)[index]); SourceVectorType src_1 = svld1(pg, &src_rows.at(1)[index]); SourceVectorType src_2 = svld1(pg, &src_rows.at(2)[index]); @@ -87,25 +87,23 @@ class ScharrInterleaved { svst2(pg, &vert_deriv_buffer_[index], vert_interleaved); } - void process_vertical(Rows src_rows) - KLEIDICV_STREAMING_COMPATIBLE { + void process_vertical(Rows src_rows) KLEIDICV_STREAMING { LoopUnroll2 loop{width_ * src_rows.channels(), SourceVecTraits::num_lanes()}; svbool_t pg_all = SourceVecTraits::svptrue(); - loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING { vertical_vector_path(pg_all, src_rows, index); }); - loop.remaining( - [&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, index); - }); + loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, index); + }); } void horizontal_vector_path(svbool_t pg, Rows dst_rows, - ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + ptrdiff_t index) KLEIDICV_STREAMING { // Horizontal derivative approximation BufferVectorType hori_buff_0 = svld1(pg, &hori_deriv_buffer_[index]); BufferVectorType hori_buff_2 = svld1(pg, &hori_deriv_buffer_[index + 2]); @@ -126,22 +124,20 @@ class ScharrInterleaved { svst2(pg, &dst_rows.at(0, index)[0], interleaved_result); } - void process_horizontal(Rows dst_rows) - KLEIDICV_STREAMING_COMPATIBLE { + void process_horizontal(Rows dst_rows) KLEIDICV_STREAMING { // width is decremented by 2 as the result has less columns. LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), BufferVecTraits::num_lanes()}; svbool_t pg_all = BufferVecTraits::svptrue(); - loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING { horizontal_vector_path(pg_all, dst_rows, index); }); - loop.remaining( - [&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, dst_rows, index); - }); + loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, dst_rows, index); + }); } Rows hori_deriv_buffer_; @@ -157,7 +153,7 @@ class ScharrBufferDeleter { static kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, - size_t y_end) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_end) KLEIDICV_STREAMING { // Does not include checks for whether the operation is implemented. // This must be done earlier, by scharr_interleaved_is_implemented. CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index ceb125add32931c6eddf9143699090c3dee4ba5c..02f0c6fb6be809d31d8b081ec32d37037c1dba93 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -47,9 +47,9 @@ class SeparableFilter2D { kernel_y_3_u8_(kernel_y_3_u8), kernel_y_4_u8_(kernel_y_4_u8) {} - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[5], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[5], + BufferType *dst) const KLEIDICV_STREAMING { // 0 BufferVectorType acc_b = svmullb_u16(src[0], kernel_y_0_u8_); BufferVectorType acc_t = svmullt_u16(src[0], kernel_y_0_u8_); @@ -82,9 +82,9 @@ class SeparableFilter2D { svst2(pg, &dst[0], interleaved); } - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[5], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[5], + DestinationType *dst) const KLEIDICV_STREAMING { // 0 svuint32_t acc_b = svmullb_u32(src[0], kernel_x_0_u16_); svuint32_t acc_t = svmullt_u32(src[0], kernel_x_0_u16_); @@ -116,8 +116,8 @@ class SeparableFilter2D { svst1b_u16(pg, &dst[0], acc_u16); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[5], + DestinationType *dst) const KLEIDICV_STREAMING { SourceType acc; // NOLINT if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { dst[0] = std::numeric_limits::max(); @@ -185,9 +185,9 @@ class SeparableFilter2D { kernel_y_3_u16_(kernel_y_3_u16), kernel_y_4_u16_(kernel_y_4_u16) {} - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[5], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[5], + BufferType *dst) const KLEIDICV_STREAMING { // 0 BufferVectorType acc_b = svmullb_u32(src[0], kernel_y_0_u16_); BufferVectorType acc_t = svmullt_u32(src[0], kernel_y_0_u16_); @@ -220,9 +220,9 @@ class SeparableFilter2D { svst2(pg, &dst[0], interleaved); } - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[5], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[5], + DestinationType *dst) const KLEIDICV_STREAMING { // 0 svuint64_t acc_b = svmullb_u64(src[0], kernel_x_0_u32_); svuint64_t acc_t = svmullt_u64(src[0], kernel_x_0_u32_); @@ -254,8 +254,8 @@ class SeparableFilter2D { svst1h_u32(pg, &dst[0], acc_u32); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[5], + DestinationType *dst) const KLEIDICV_STREAMING { SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { dst[0] = std::numeric_limits::max(); @@ -323,9 +323,9 @@ class SeparableFilter2D { kernel_y_3_s16_(kernel_y_3_s16), kernel_y_4_s16_(kernel_y_4_s16) {} - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[5], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[5], + BufferType *dst) const KLEIDICV_STREAMING { // 0 BufferVectorType acc_b = svmullb_s32(src[0], kernel_y_0_s16_); BufferVectorType acc_t = svmullt_s32(src[0], kernel_y_0_s16_); @@ -358,9 +358,9 @@ class SeparableFilter2D { svst2(pg, &dst[0], interleaved); } - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[5], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[5], + DestinationType *dst) const KLEIDICV_STREAMING { // 0 svint64_t acc_b = svmullb_s64(src[0], kernel_x_0_s32_); svint64_t acc_t = svmullt_s64(src[0], kernel_x_0_s32_); @@ -397,8 +397,8 @@ class SeparableFilter2D { svst1h_s32(pg, &dst[0], acc_s32); } - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[5], + DestinationType *dst) const KLEIDICV_STREAMING { int64_t acc = static_cast(src[0]) * kernel_x_[0]; for (size_t i = 1; i < 5; i++) { acc += static_cast(src[i]) * kernel_x_[i]; @@ -433,7 +433,7 @@ template static kleidicv_error_t separable_filter_2d_checks( const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, size_t height, size_t channels, const T *kernel_x, const T *kernel_y, - SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { CHECK_POINTERS(workspace, kernel_x, kernel_y); CHECK_POINTER_AND_STRIDE(src, src_stride, height); @@ -462,7 +462,7 @@ kleidicv_error_t separable_filter_2d_stripe_sc( size_t height, size_t y_begin, size_t y_end, size_t channels, const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y, size_t /*kernel_height*/, FixedBorderType fixed_border_type, - kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + kleidicv_filter_context_t *context) KLEIDICV_STREAMING { auto *workspace = reinterpret_cast(context); kleidicv_error_t checks_result = separable_filter_2d_checks( src, src_stride, dst, dst_stride, width, height, channels, kernel_x, diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index fc0a5d08f9b8fd8036b7c657c76fd491063c7431..e018f9a63e05b089ebde7fa51f370aef6b4e237b 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -34,9 +34,9 @@ class HorizontalSobel3x3 { // Applies vertical filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[3], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_u16_b = svaddlb(src[0], src[2]); svuint16_t acc_u16_t = svaddlt(src[0], src[2]); acc_u16_b = svmlalb(acc_u16_b, src[1], svdup_n_u8(2)); @@ -50,17 +50,17 @@ class HorizontalSobel3x3 { // Applies horizontal filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[3], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[3], + DestinationType *dst) const KLEIDICV_STREAMING { svst1(pg, &dst[0], svsub_x(pg, src[2], src[0])); } // Applies horizontal filtering vector using scalar operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[3], + DestinationType *dst) const KLEIDICV_STREAMING { // Explicitly narrow. Overflow is permitted. dst[0] = static_cast(src[2] - src[0]); } @@ -87,9 +87,9 @@ class VerticalSobel3x3 { // Applies vertical filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void vertical_vector_path( - svbool_t pg, std::reference_wrapper src[3], - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void vertical_vector_path(svbool_t pg, + std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING { svuint16_t acc_u16_b = svsublb(src[2], src[0]); svuint16_t acc_u16_t = svsublt(src[2], src[0]); @@ -101,9 +101,9 @@ class VerticalSobel3x3 { // Applies horizontal filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_vector_path( - svbool_t pg, std::reference_wrapper src[3], - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_vector_path(svbool_t pg, + std::reference_wrapper src[3], + DestinationType *dst) const KLEIDICV_STREAMING { svint16_t acc = svadd_x(pg, src[0], src[2]); acc = svmad_s16_x(pg, src[1], svdup_n_s16(2), acc); svst1(pg, &dst[0], acc); @@ -112,8 +112,8 @@ class VerticalSobel3x3 { // Applies horizontal filtering vector using scalar operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { + void horizontal_scalar_path(const BufferType src[3], + DestinationType *dst) const KLEIDICV_STREAMING { // Explicitly narrow. Overflow is permitted. dst[0] = static_cast(src[0] + 2 * src[1] + src[2]); } @@ -123,7 +123,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t sobel_3x3_horizontal_stripe_s16_u8_sc( const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE { + size_t channels) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); @@ -153,7 +153,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t sobel_3x3_vertical_stripe_s16_u8_sc( const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, size_t y_begin, size_t y_end, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE { + size_t channels) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/logical/bitwise_and_sc.h b/kleidicv/src/logical/bitwise_and_sc.h index 86ccf588bfdd04e148d3158abf8b86c2ebce58ac..773620eb807a0c60c76a43304be46e71ce6f2335 100644 --- a/kleidicv/src/logical/bitwise_and_sc.h +++ b/kleidicv/src/logical/bitwise_and_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -18,7 +18,7 @@ class BitwiseAnd final : public UnrollTwice { using VectorType = typename VecTraits::VectorType; VectorType vector_path(ContextType ctx, VectorType src_a, - VectorType src_b) KLEIDICV_STREAMING_COMPATIBLE { + VectorType src_b) KLEIDICV_STREAMING { return svand_x(ctx.predicate(), src_a, src_b); } }; // end of class BitwiseAnd @@ -27,7 +27,7 @@ template kleidicv_error_t bitwise_and_sc(const T *src_a, size_t src_a_stride, const T *src_b, size_t src_b_stride, T *dst, size_t dst_stride, size_t width, - size_t height) KLEIDICV_STREAMING_COMPATIBLE { + size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_a, src_a_stride, height); CHECK_POINTER_AND_STRIDE(src_b, src_b_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); diff --git a/kleidicv/src/morphology/morphology_sc.h b/kleidicv/src/morphology/morphology_sc.h index 2a1021abc68e9065604dab6f35051195c3626195..6d8797dc1739b2e0c0f055c72c8843bce103eaf4 100644 --- a/kleidicv/src/morphology/morphology_sc.h +++ b/kleidicv/src/morphology/morphology_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -23,15 +23,14 @@ class CopyDataSVE2 { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; - VectorType vector_path(ContextType, - VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + VectorType vector_path(ContextType, VectorType src) KLEIDICV_STREAMING { return src; } }; // end of class CopyOperation public: void operator()(Rows src_rows, Rows dst_rows, - size_t length) const KLEIDICV_STREAMING_COMPATIBLE { + size_t length) const KLEIDICV_STREAMING { // 'apply_operation_by_rows' can only handle one channel well // so width must be multiplied in order to copy all the data Rectangle rect{length * dst_rows.channels(), std::size_t{1}}; @@ -47,12 +46,12 @@ class VerticalOp final { public: using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - VerticalOp(Rectangle rect, Rectangle kernel) KLEIDICV_STREAMING_COMPATIBLE + VerticalOp(Rectangle rect, Rectangle kernel) KLEIDICV_STREAMING : rect_(rect), kernel_(kernel) {} void process_rows(IndirectRows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { if (KLEIDICV_UNLIKELY(kernel_.height()) == 1) { CopyRows::copy_rows(rect_, src_rows, dst_rows); return; @@ -65,13 +64,13 @@ class VerticalOp final { LoopUnroll2 loop{rect_.width() * src_rows.channels(), VecTraits::num_lanes()}; // clang-format off - loop.unroll_four_times([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_four_times([&](size_t index) KLEIDICV_STREAMING { vector_path_4x(src_rows, dst_rows, index, height); }) - .unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_twice([&](size_t index) KLEIDICV_STREAMING { vector_path_2x(src_rows, dst_rows, index, height); }) - .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { svbool_t pg = VecTraits::svwhilelt(index, length); while (svptest_first(VecTraits::svptrue(), pg)) { vector_path(pg, src_rows, dst_rows, index, height); @@ -88,7 +87,7 @@ class VerticalOp final { private: void vector_path_4x(IndirectRows src_rows, Rows dst_rows, const size_t index, - const size_t height) KLEIDICV_STREAMING_COMPATIBLE { + const size_t height) KLEIDICV_STREAMING { const ScalarType *src_row = &src_rows[index]; auto first_row0 = svld1(VecTraits::svptrue(), &src_row[0]); auto first_row1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -105,7 +104,7 @@ class VerticalOp final { LoopUnroll loop{kernel_.height() - 2, 2}; - loop.unroll_once([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t step) KLEIDICV_STREAMING { const ScalarType *src_row0 = &src_rows.at(0)[index]; const ScalarType *src_row1 = &src_rows.at(1)[index]; auto row00 = svld1(VecTraits::svptrue(), src_row0); @@ -128,7 +127,7 @@ class VerticalOp final { }); loop.tail([&](size_t /* index */) // NOLINT(readability/casting) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { const ScalarType *src_row = &src_rows[index]; auto row0 = svld1(VecTraits::svptrue(), &src_row[0]); auto row1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -188,7 +187,7 @@ class VerticalOp final { void vector_path_2x(IndirectRows src_rows, Rows dst_rows, const size_t index, - const size_t height) KLEIDICV_STREAMING_COMPATIBLE { + const size_t height) KLEIDICV_STREAMING { const ScalarType *src_row = &src_rows[index]; auto first_row0 = svld1(VecTraits::svptrue(), &src_row[0]); auto first_row1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -201,7 +200,7 @@ class VerticalOp final { LoopUnroll loop{kernel_.height() - 2, 2}; - loop.unroll_once([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t step) KLEIDICV_STREAMING { const ScalarType *src_row0 = &src_rows.at(0)[index]; const ScalarType *src_row1 = &src_rows.at(1)[index]; auto row00 = svld1(VecTraits::svptrue(), src_row0); @@ -216,7 +215,7 @@ class VerticalOp final { }); loop.tail([&](size_t /* index */) // NOLINT(readability/casting) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { const ScalarType *src_row = &src_rows[index]; auto row0 = svld1(VecTraits::svptrue(), &src_row[0]); auto row1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -259,7 +258,7 @@ class VerticalOp final { void vector_path(svbool_t pg, IndirectRows src_rows, Rows dst_rows, const size_t index, - const size_t height) KLEIDICV_STREAMING_COMPATIBLE { + const size_t height) KLEIDICV_STREAMING { auto first_row = svld1(pg, &src_rows[index]); ++src_rows; @@ -268,7 +267,7 @@ class VerticalOp final { LoopUnroll loop{kernel_.height() - 2, 2}; - loop.unroll_once([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_once([&](size_t step) KLEIDICV_STREAMING { auto row0 = svld1(pg, &src_rows.at(0)[index]); auto row1 = svld1(pg, &src_rows.at(1)[index]); acc = O::operation(pg, acc, O::operation(pg, row0, row1)); @@ -276,7 +275,7 @@ class VerticalOp final { }); loop.tail([&](size_t /* index */) // NOLINT(readability/casting) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { auto row = svld1(pg, &src_rows[index]); acc = O::operation(pg, acc, row); ++src_rows; @@ -312,25 +311,25 @@ class HorizontalOp final { public: using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; - HorizontalOp(Rectangle rect, Rectangle kernel) KLEIDICV_STREAMING_COMPATIBLE + HorizontalOp(Rectangle rect, Rectangle kernel) KLEIDICV_STREAMING : rect_(rect), kernel_(kernel) {} void process_rows(Rows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { // Iterate across the rows from top to bottom. for (size_t height = 0; height < rect_.height(); ++height) { // Iterate across the columns from left to right. LoopUnroll2 loop{rect_.width() * src_rows.channels(), VecTraits::num_lanes()}; // clang-format off - loop.unroll_four_times([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + loop.unroll_four_times([&](size_t index) KLEIDICV_STREAMING { vector_path_4x(src_rows, dst_rows, index); }) - .unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_twice([&](size_t index) KLEIDICV_STREAMING { vector_path_2x(src_rows, dst_rows, index); }) - .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { + .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { svbool_t pg = VecTraits::svwhilelt(index, length); while (svptest_first(VecTraits::svptrue(), pg)) { vector_path(pg, src_rows, dst_rows, index); @@ -347,7 +346,7 @@ class HorizontalOp final { private: void vector_path_4x(Rows src_rows, Rows dst_rows, - const size_t index) KLEIDICV_STREAMING_COMPATIBLE { + const size_t index) KLEIDICV_STREAMING { const auto *src_row = &src_rows[index]; auto acc0 = svld1(VecTraits::svptrue(), &src_row[0]); auto acc1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -375,7 +374,7 @@ class HorizontalOp final { void vector_path_2x(Rows src_rows, Rows dst_rows, - const size_t index) KLEIDICV_STREAMING_COMPATIBLE { + const size_t index) KLEIDICV_STREAMING { const auto *src_row = &src_rows[index]; auto acc0 = svld1(VecTraits::svptrue(), &src_row[0]); auto acc1 = svld1_vnum(VecTraits::svptrue(), &src_row[0], 1); @@ -395,7 +394,7 @@ class HorizontalOp final { void vector_path(svbool_t pg, Rows src_rows, Rows dst_rows, - const size_t index) KLEIDICV_STREAMING_COMPATIBLE { + const size_t index) KLEIDICV_STREAMING { auto acc = svld1(pg, &src_rows[index]); for (size_t width = 1; width < kernel_.width(); ++width) { @@ -417,7 +416,7 @@ class Min final { using VectorType = typename VecTraits::VectorType; static VectorType operation(svbool_t pg, VectorType lhs, - VectorType rhs) KLEIDICV_STREAMING_COMPATIBLE { + VectorType rhs) KLEIDICV_STREAMING { return svmin_x(pg, lhs, rhs); } }; // end of class Min @@ -429,7 +428,7 @@ class Max final { using VectorType = typename VecTraits::VectorType; static VectorType operation(svbool_t pg, VectorType lhs, - VectorType rhs) KLEIDICV_STREAMING_COMPATIBLE { + VectorType rhs) KLEIDICV_STREAMING { return svmax_x(pg, lhs, rhs); } }; // end of class Max @@ -452,18 +451,16 @@ class DilateOperation final { using DestinationType = ScalarType; using CopyData = CopyDataOperation; - explicit DilateOperation(Rectangle kernel) KLEIDICV_STREAMING_COMPATIBLE + explicit DilateOperation(Rectangle kernel) KLEIDICV_STREAMING : kernel_{kernel} {} void process_horizontal(Rectangle rect, Rows src_rows, - Rows dst_rows) - KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { HorizontalMax{rect, kernel_}.process_rows(src_rows, dst_rows); } void process_vertical(Rectangle rect, IndirectRows src_rows, - Rows dst_rows) - KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { VerticalMax{rect, kernel_}.process_rows(src_rows, dst_rows); } @@ -474,8 +471,7 @@ class DilateOperation final { template static kleidicv_error_t dilate_sc( const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, - size_t height, - kleidicv_morphology_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + size_t height, kleidicv_morphology_context_t *context) KLEIDICV_STREAMING { CHECK_POINTERS(context); CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); @@ -524,18 +520,16 @@ class ErodeOperation final { using DestinationType = ScalarType; using CopyData = CopyDataOperation; - explicit ErodeOperation(Rectangle kernel) KLEIDICV_STREAMING_COMPATIBLE + explicit ErodeOperation(Rectangle kernel) KLEIDICV_STREAMING : kernel_{kernel} {} void process_horizontal(Rectangle rect, Rows src_rows, - Rows dst_rows) - KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { HorizontalMin{rect, kernel_}.process_rows(src_rows, dst_rows); } void process_vertical(Rectangle rect, IndirectRows src_rows, - Rows dst_rows) - KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { VerticalMin{rect, kernel_}.process_rows(src_rows, dst_rows); } @@ -544,10 +538,9 @@ class ErodeOperation final { }; // end of class ErodeOperation template -static kleidicv_error_t erode_sc(const T *src, size_t src_stride, T *dst, - size_t dst_stride, size_t width, size_t height, - kleidicv_morphology_context_t *context) - KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t erode_sc( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, kleidicv_morphology_context_t *context) KLEIDICV_STREAMING { CHECK_POINTERS(context); CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index ac80fd2a7f5d6d4b92306a77243dd1171b53e7ae..8f4de64b9d607025980d353de63b278a48a554cb 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -15,12 +15,11 @@ namespace KLEIDICV_TARGET_NAMESPACE { KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 2; size_t dst_height = src_height * 2; - auto lerp1d_vector = [](svuint8_t near, - svuint8_t far) KLEIDICV_STREAMING_COMPATIBLE { + auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING { // near * 3 svuint16_t near3b = svmullb(near, uint8_t{3}); svuint16_t near3t = svmullt(near, uint8_t{3}); @@ -40,8 +39,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( }; auto lerp2d_vector = [](svbool_t pg, svuint8_t near, svuint8_t mid_a, - svuint8_t mid_b, - svuint8_t far) KLEIDICV_STREAMING_COMPATIBLE { + svuint8_t mid_b, svuint8_t far) KLEIDICV_STREAMING { // near * 9 svuint16_t near9b = svmullb(near, uint8_t{9}); svuint16_t near9t = svmullt(near, uint8_t{9}); @@ -69,12 +67,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( return near9_mid3_far_8_div16; }; - // Work-around for clang-format oddness. -#define KSC KLEIDICV_STREAMING_COMPATIBLE - // Handle top or bottom edge auto process_edge_row = [src_width, dst_width, lerp1d_vector]( - const uint8_t *src_row, uint8_t *dst_row) KSC { + const uint8_t *src_row, + uint8_t *dst_row) KLEIDICV_STREAMING { // Left element dst_row[0] = src_row[0]; @@ -99,7 +95,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, - uint8_t *dst_row1) KLEIDICV_STREAMING_COMPATIBLE { + uint8_t *dst_row1) KLEIDICV_STREAMING { // Left elements svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read/write 1 element { @@ -164,30 +160,32 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 4; size_t dst_height = src_height * 4; - auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b) KSC { - // bias - svuint16_t top = svdup_u16(4); + auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b) + KLEIDICV_STREAMING { + // bias + svuint16_t top = svdup_u16(4); - // bias + a * p - svuint16_t bot = svmlalb(top, a, p); - top = svmlalt(top, a, p); + // bias + a * p + svuint16_t bot = svmlalb(top, a, p); + top = svmlalt(top, a, p); - // bias + a * p + b * q - bot = svmlalb(bot, b, q); - top = svmlalt(top, b, q); + // bias + a * p + b * q + bot = svmlalb(bot, b, q); + top = svmlalt(top, b, q); - // (bias + a * p + b * q) / 8 - svuint8_t result = svshrnb(bot, 3ULL); - result = svshrnt(result, top, 3ULL); - return result; - }; + // (bias + a * p + b * q) / 8 + svuint8_t result = svshrnb(bot, 3ULL); + result = svshrnt(result, top, 3ULL); + return result; + }; auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b, - uint8_t r, svuint8_t c, uint8_t s, svuint8_t d) KSC { + uint8_t r, svuint8_t c, uint8_t s, + svuint8_t d) KLEIDICV_STREAMING { // bias svuint16_t top = svdup_u16(32); @@ -214,7 +212,8 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( // Handle top or bottom edge auto process_edge_row = [src_width, dst_width, lerp1d_vector]( - const uint8_t *src_row, uint8_t *dst_row) KSC { + const uint8_t *src_row, + uint8_t *dst_row) KLEIDICV_STREAMING { // Left elements dst_row[1] = dst_row[0] = src_row[0]; @@ -237,7 +236,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, uint8_t *dst_row1, uint8_t *dst_row2, - uint8_t *dst_row3) KLEIDICV_STREAMING_COMPATIBLE { + uint8_t *dst_row3) KLEIDICV_STREAMING { // Left elements svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read 1 element svbool_t pg2 = svptrue_pat_b8(SV_VL2); // write 2 elements @@ -294,7 +293,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( }; auto copy_dst_row = [src_width](const uint8_t *dst_from, - uint8_t *dst_to) KSC { + uint8_t *dst_to) KLEIDICV_STREAMING { for (size_t i = 0; i < src_width; i += svcntb()) { svbool_t pg = svwhilelt_b8(i, src_width); svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4)); @@ -334,19 +333,19 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, float *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 2; src_stride /= sizeof(float); dst_stride /= sizeof(float); auto lerp1d_vector = [](svbool_t pg, svfloat32_t near, - svfloat32_t far) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t far) KLEIDICV_STREAMING { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.75F), far, 0.25F); }; auto lerp2d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t mid_a, svfloat32_t mid_b, - svfloat32_t far) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t far) KLEIDICV_STREAMING { return svmla_n_f32_x( pg, svmla_n_f32_x( @@ -356,12 +355,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( far, 0.0625F); }; - // Work-around for clang-format oddness. -#define KSC KLEIDICV_STREAMING_COMPATIBLE - // Handle top or bottom edge auto process_edge_row = [src_width, dst_width, lerp1d_vector]( - const float *src_row, float *dst_row) KSC { + const float *src_row, + float *dst_row) KLEIDICV_STREAMING { // Left element dst_row[0] = src_row[0]; @@ -384,7 +381,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const float *src_row0, const float *src_row1, - float *dst_row0, float *dst_row1) KSC { + float *dst_row0, float *dst_row1) KLEIDICV_STREAMING { // Left elements svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read/write 1 element { @@ -447,19 +444,20 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, float *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 4; size_t dst_height = src_height * 4; src_stride /= sizeof(float); dst_stride /= sizeof(float); - auto lerp1d_vector = - [](svbool_t pg, float p, svfloat32_t a, float q, svfloat32_t b) - KSC { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); }; + auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q, + svfloat32_t b) KLEIDICV_STREAMING { + return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); + }; auto lerp2d_vector = [](svbool_t pg, float p, svfloat32_t a, float q, svfloat32_t b, float r, svfloat32_t c, float s, - svfloat32_t d) KSC { + svfloat32_t d) KLEIDICV_STREAMING { return svmla_n_f32_x( pg, svmla_n_f32_x(pg, svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q), c, @@ -469,7 +467,8 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( // Handle top or bottom edge auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector]( - const float *src_row, float *dst_row) KSC { + const float *src_row, + float *dst_row) KLEIDICV_STREAMING { // Left elements dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0]; @@ -497,7 +496,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const float *src_row0, const float *src_row1, float *dst_row0, float *dst_row1, float *dst_row2, - float *dst_row3) KSC { + float *dst_row3) KLEIDICV_STREAMING { // Left elements svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element svbool_t pg2 = svptrue_pat_b32(SV_VL2); // write 2 elements @@ -603,7 +602,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, float *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 8; size_t dst_height = src_height * 8; src_stride /= sizeof(float); @@ -619,13 +618,12 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( svfloat32_t coeffs_b1 = svld1(svptrue_b32(), &coeffs_b[4]); auto lerp1d_vector_n = [](svbool_t pg, float p, svfloat32_t a, float q, - svfloat32_t b) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t b) KLEIDICV_STREAMING { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); }; auto lerp1d_vector = [](svbool_t pg, svfloat32_t p, svfloat32_t a, - svfloat32_t q, - svfloat32_t b) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t q, svfloat32_t b) KLEIDICV_STREAMING { return svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q); }; @@ -633,7 +631,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( auto process_edge_row = [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { // Left elements float left = src_row[0]; float *dst = dst_row; @@ -691,7 +689,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( auto lerp2d_vector = [](svbool_t pg, svfloat32_t a, svfloat32_t p, svfloat32_t b, svfloat32_t q, svfloat32_t c, svfloat32_t r, svfloat32_t d, - svfloat32_t s) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t s) KLEIDICV_STREAMING { return svmla_f32_x( pg, svmla_f32_x(pg, svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q), c, r), d, s); @@ -699,10 +697,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, &coeffs_p0, &coeffs_q0, &coeffs_r0, &coeffs_s0, &coeffs_p1, - &coeffs_q1, &coeffs_r1, &coeffs_s1]( - const float *src_row0, const float *src_row1, - float *dst_row0, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + &coeffs_q1, &coeffs_r1, + &coeffs_s1](const float *src_row0, const float *src_row1, + float *dst_row0, + size_t dst_stride) KLEIDICV_STREAMING { // Left elements svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements @@ -823,7 +821,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, float *dst, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_stride) KLEIDICV_STREAMING { size_t dst_width = src_width * 8; size_t dst_height = src_height * 8; src_stride /= sizeof(float); @@ -864,13 +862,15 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( coeffs_b = svdiv_n_f32_x(svptrue_b32(), repetitive_float, 16.0F); coeffs_a = svsub_x(svptrue_b32(), svdup_f32(1.0F), coeffs_b); } - auto lerp1d_vector = - [](svbool_t pg, float p, svfloat32_t a, float q, svfloat32_t b) - KSC { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); }; + auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q, + svfloat32_t b) KLEIDICV_STREAMING { + return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); + }; auto index_and_lerp1d = [&coeffs_a, &coeffs_b]( svbool_t pg, svuint32_t indices_a, - svuint32_t indices_b, svfloat32_t src) KSC { + svuint32_t indices_b, + svfloat32_t src) KLEIDICV_STREAMING { return svmla_f32_x(pg, svmul_f32_x(pg, svtbl(src, indices_a), coeffs_a), svtbl(src, indices_b), coeffs_b); }; @@ -880,7 +880,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( &indices_0b, &indices_1a, &indices_1b, &indices_2a, &indices_2b, &indices_3a, &indices_3b](const float *src_row, float *dst_row, - size_t dst_stride) KSC { + size_t dst_stride) KLEIDICV_STREAMING { // Left elements float left = src_row[0]; float *dst = dst_row; @@ -952,7 +952,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( auto index_and_lerp2d = [&coeffs_p, &coeffs_q, &coeffs_r, &coeffs_s]( svbool_t pg, svuint32_t indices_a, svuint32_t indices_b, svfloat32_t src0, - svfloat32_t src1) KSC { + svfloat32_t src1) KLEIDICV_STREAMING { return svmla_f32_x( pg, svmla_f32_x( @@ -967,8 +967,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( &indices_0a, &indices_0b, &indices_1a, &indices_1b, &indices_2a, &indices_2b, &indices_3a, &indices_3b]( const float *src_row0, const float *src_row1, - float *dst_row, - size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + float *dst_row, size_t dst_stride) KLEIDICV_STREAMING { // Left edge svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements @@ -1113,7 +1112,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_width, size_t dst_height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); @@ -1139,7 +1138,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc( KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, size_t y_begin, size_t y_end, float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_width, size_t dst_height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); diff --git a/kleidicv/src/resize/resize_sc.h b/kleidicv/src/resize/resize_sc.h index b67d8bc6610b5fe020cfd965c5598954045c5afe..373fb83e2128bf55064d253f9ecb46f33c75ae72 100644 --- a/kleidicv/src/resize/resize_sc.h +++ b/kleidicv/src/resize/resize_sc.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -10,9 +10,8 @@ namespace KLEIDICV_TARGET_NAMESPACE { -static inline svuint8_t resize_parallel_vectors(svbool_t pg, svuint8_t top_row, - svuint8_t bottom_row) - KLEIDICV_STREAMING_COMPATIBLE { +static inline svuint8_t resize_parallel_vectors( + svbool_t pg, svuint8_t top_row, svuint8_t bottom_row) KLEIDICV_STREAMING { svuint16_t result_before_averaging_b = svaddlb(top_row, bottom_row); svuint16_t result_before_averaging_t = svaddlt(top_row, bottom_row); svuint16_t result_before_averaging = @@ -22,7 +21,7 @@ static inline svuint8_t resize_parallel_vectors(svbool_t pg, svuint8_t top_row, static inline void parallel_rows_vectors_path_2x( svbool_t pg, Rows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { svuint8_t top_row_0 = svld1(pg, &src_rows.at(0)[0]); svuint8_t bottom_row_0 = svld1(pg, &src_rows.at(1)[0]); svuint8_t top_row_1 = svld1_vnum(pg, &src_rows.at(0)[0], 1); @@ -39,7 +38,7 @@ static inline void parallel_rows_vectors_path_2x( static inline void parallel_rows_vectors_path( svbool_t pg, Rows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { svuint8_t top_line = svld1(pg, &src_rows.at(0)[0]); svuint8_t bottom_line = svld1(pg, &src_rows.at(1)[0]); svuint8_t result = resize_parallel_vectors(pg, top_line, bottom_line); @@ -47,32 +46,32 @@ static inline void parallel_rows_vectors_path( } template -static inline void process_parallel_rows( - Rows src_rows, size_t src_width, - Rows dst_rows, size_t dst_width) KLEIDICV_STREAMING_COMPATIBLE { +static inline void process_parallel_rows(Rows src_rows, + size_t src_width, + Rows dst_rows, + size_t dst_width) KLEIDICV_STREAMING { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; const size_t size_mask = ~static_cast(1U); // Process rows up to the last even pixel index. LoopUnroll2{src_width & size_mask, VecTraits::num_lanes()} // Process double vector chunks. - .unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_twice([&](size_t index) KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); parallel_rows_vectors_path_2x(pg, src_rows.at(0, index), dst_rows.at(0, index / 2)); }) - .unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_once([&](size_t index) KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); parallel_rows_vectors_path(pg, src_rows.at(0, index), dst_rows.at(0, index / 2)); }) // Process the remaining chunk of the row. - .remaining([&](size_t index, size_t length) - KLEIDICV_STREAMING_COMPATIBLE { - auto pg = VecTraits::svwhilelt(index, length); - parallel_rows_vectors_path(pg, src_rows.at(0, index), - dst_rows.at(0, index / 2)); - }); + .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + auto pg = VecTraits::svwhilelt(index, length); + parallel_rows_vectors_path(pg, src_rows.at(0, index), + dst_rows.at(0, index / 2)); + }); // Handle the last odd column, if any. if (dst_width > (src_width / 2)) { @@ -83,14 +82,14 @@ static inline void process_parallel_rows( } } -static inline svuint8_t resize_single_row(svbool_t pg, svuint8_t row) - KLEIDICV_STREAMING_COMPATIBLE { +static inline svuint8_t resize_single_row(svbool_t pg, + svuint8_t row) KLEIDICV_STREAMING { return svrshrnb(svadalp_x(pg, svdup_u16(0), row), 1); } static inline void single_row_vector_path_2x( svbool_t pg, Rows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { svuint8_t line0 = svld1(pg, &src_rows[0]); svuint8_t line1 = svld1_vnum(pg, &src_rows[0], 1); svuint8_t result0 = svrshrnb(svadalp_x(pg, svdup_u16(0), line0), 1); @@ -101,39 +100,39 @@ static inline void single_row_vector_path_2x( static inline void single_row_vector_path( svbool_t pg, Rows src_rows, - Rows dst_rows) KLEIDICV_STREAMING_COMPATIBLE { + Rows dst_rows) KLEIDICV_STREAMING { svuint8_t line = svld1(pg, &src_rows.at(0)[0]); svuint8_t result = svrshrnb(svadalp_x(pg, svdup_u16(0), line), 1); svst1b(pg, &dst_rows[0], svreinterpret_u16_u8(result)); } template -static inline void process_single_row( - Rows src_rows, size_t src_width, - Rows dst_rows, size_t dst_width) KLEIDICV_STREAMING_COMPATIBLE { +static inline void process_single_row(Rows src_rows, + size_t src_width, + Rows dst_rows, + size_t dst_width) KLEIDICV_STREAMING { using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; const size_t size_mask = ~static_cast(1U); // Process rows up to the last even pixel index. LoopUnroll2{src_width & size_mask, VecTraits::num_lanes()} // Process full vector chunks. - .unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_twice([&](size_t index) KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); single_row_vector_path_2x(pg, src_rows.at(0, index), dst_rows.at(0, index / 2)); }) - .unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { + .unroll_once([&](size_t index) KLEIDICV_STREAMING { auto pg = VecTraits::svptrue(); single_row_vector_path(pg, src_rows.at(0, index), dst_rows.at(0, index / 2)); }) // Process the remaining chunk of the row. - .remaining([&](size_t index, size_t length) - KLEIDICV_STREAMING_COMPATIBLE { - auto pg = VecTraits::svwhilelt(index, length); - single_row_vector_path(pg, src_rows.at(0, index), - dst_rows.at(0, index / 2)); - }); + .remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + auto pg = VecTraits::svwhilelt(index, length); + single_row_vector_path(pg, src_rows.at(0, index), + dst_rows.at(0, index / 2)); + }); // Handle the last odd column, if any. if (dst_width > (src_width / 2)) { @@ -142,8 +141,8 @@ static inline void process_single_row( } KLEIDICV_TARGET_FN_ATTRS -static kleidicv_error_t check_dimensions(size_t src_dim, size_t dst_dim) - KLEIDICV_STREAMING_COMPATIBLE { +static kleidicv_error_t check_dimensions(size_t src_dim, + size_t dst_dim) KLEIDICV_STREAMING { size_t half_src_dim = src_dim / 2; if ((src_dim % 2) == 0) { @@ -162,7 +161,7 @@ static kleidicv_error_t check_dimensions(size_t src_dim, size_t dst_dim) KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_to_quarter_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, - size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(src_width, src_height); @@ -181,7 +180,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_to_quarter_u8_sc( // Process two rows at once. loop.unroll_once([&](size_t) // NOLINT(readability/casting) - KLEIDICV_STREAMING_COMPATIBLE { + KLEIDICV_STREAMING { process_parallel_rows(src_rows, src_width, dst_rows, dst_width); src_rows += 2; @@ -190,7 +189,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_to_quarter_u8_sc( // Handle an odd row, if any. if (dst_height > (src_height / 2)) { - loop.remaining([&](size_t, size_t) KLEIDICV_STREAMING_COMPATIBLE { + loop.remaining([&](size_t, size_t) KLEIDICV_STREAMING { process_single_row(src_rows, src_width, dst_rows, dst_width); }); }