From efdb98d0e14abd65da59211b2170c25b34bb38e1 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 30 Jul 2025 11:55:18 +0000 Subject: [PATCH 1/2] WIP new border handling SVE --- .vscode/launch.json | 45 ++ .vscode/tasks.json | 62 +++ .../include/kleidicv/workspace/separable.h | 84 ++-- kleidicv/src/filters/border_generic_neon.h | 9 + kleidicv/src/filters/border_generic_sc.h | 409 ++++++++++++++++++ .../src/filters/gaussian_blur_fixed_neon.cpp | 8 +- kleidicv/src/filters/gaussian_blur_fixed_sc.h | 91 +++- .../src/filters/separable_filter_2d_api.cpp | 2 +- .../src/filters/separable_filter_2d_neon.cpp | 6 +- kleidicv/src/filters/separable_filter_2d_sc.h | 6 +- kleidicv/src/filters/sobel_neon.cpp | 11 +- kleidicv/src/filters/sobel_sc.h | 11 +- test/CMakeLists.txt | 6 +- test/api/test_gaussian_blur.cpp | 28 +- test/library/CMakeLists.txt | 69 +++ test/library/test_border_generic_sve2.cpp | 162 +++++++ 16 files changed, 929 insertions(+), 80 deletions(-) create mode 100644 kleidicv/src/filters/border_generic_sc.h create mode 100644 test/library/CMakeLists.txt create mode 100644 test/library/test_border_generic_sve2.cpp diff --git a/.vscode/launch.json b/.vscode/launch.json index ab26a3c21..873d3b319 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -16,6 +16,51 @@ "internalConsoleOptions": "openOnSessionStart", "preLaunchTask": "Build KleidiCV for debug", }, + { + "name": "SVE2 library tests, 128 bits", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test", + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb-multiarch", + "miDebuggerServerAddress": "localhost:2345", + "targetArchitecture": "arm64", + "debugServerPath" : "${workspaceFolder}/.devcontainer/start_qemu.sh", + "debugServerArgs": "-g 2345 -cpu max,sve128=on,sme=off ${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test --vector-length=16 --gtest_filter=*", + "cwd": "${workspaceFolder}", + 
"internalConsoleOptions": "openOnSessionStart", + "preLaunchTask": "Build KleidiCV for debug", + }, + { + "name": "SVE2 library tests, 512 bits", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test", + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb-multiarch", + "miDebuggerServerAddress": "localhost:2345", + "targetArchitecture": "arm64", + "debugServerPath" : "${workspaceFolder}/.devcontainer/start_qemu.sh", + "debugServerArgs": "-g 2345 -cpu max,sve512=on,sve-default-vector-length=64,sme=off ${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test --vector-length=64 --gtest_filter=*", + "cwd": "${workspaceFolder}", + "internalConsoleOptions": "openOnSessionStart", + "preLaunchTask": "Build KleidiCV for debug", + }, + { + "name": "SVE2 library tests, 2048 bits", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test", + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb-multiarch", + "miDebuggerServerAddress": "localhost:2345", + "targetArchitecture": "arm64", + "debugServerPath" : "${workspaceFolder}/.devcontainer/start_qemu.sh", + "debugServerArgs": "-g 2345 -cpu max,sve2048=on,sve-default-vector-length=256,sme=off ${workspaceFolder}/build/kleidicv-debug/test/library/kleidicv-library-test --vector-length=256 --gtest_filter=*", + "cwd": "${workspaceFolder}", + "internalConsoleOptions": "openOnSessionStart", + "preLaunchTask": "Build KleidiCV for debug", + }, { "name": "NEON API tests", "type": "cppdbg", diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 6ddbdd491..924f50d6e 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -111,6 +111,68 @@ "kind": "test" } }, + { + "label": "SVE2 library tests, 128 bits", + "type": "process", + "command": "qemu-aarch64", + "args": [ + "-cpu", + "max,sve128=on,sme=off", + 
"${workspaceFolder}/build/kleidicv/test/library/kleidicv-library-test", + "--vector-length=16" + ], + "dependsOn": [ + "Build KleidiCV" + ], + "group": { + "kind": "test" + } + }, + { + "label": "SVE2 library tests, 512 bits", + "type": "process", + "command": "qemu-aarch64", + "args": [ + "-cpu", + "max,sve512=on,sve-default-vector-length=64,sme=off", + "${workspaceFolder}/build/kleidicv/test/library/kleidicv-library-test", + "--vector-length=64" + ], + "dependsOn": [ + "Build KleidiCV" + ], + "group": { + "kind": "test" + } + }, + { + "label": "SVE2 library tests, 2048 bits", + "type": "process", + "command": "qemu-aarch64", + "args": [ + "-cpu", + "max,sve2048=on,sve-default-vector-length=256,sme=off", + "${workspaceFolder}/build/kleidicv/test/library/kleidicv-library-test", + "--vector-length=256" + ], + "dependsOn": [ + "Build KleidiCV" + ], + "group": { + "kind": "test" + } + }, + { + "label": "All library tests", + "dependsOn": [ + "SVE2 library tests, 128 bits", + "SVE2 library tests, 512 bits", + "SVE2 library tests, 2048 bits", + ], + "group": { + "kind": "test" + } + }, { "label": "NEON API tests", "type": "process", diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index f0564cd83..d98f7e6a2 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -82,8 +82,8 @@ class SeparableFilterWorkspace { // Creates a workspace on the heap. static Pointer create(Rectangle rect, size_t channels, - size_t intermediate_size) - KLEIDICV_STREAMING_COMPATIBLE { + size_t intermediate_size, + size_t kernel_width) KLEIDICV_STREAMING_COMPATIBLE { size_t buffer_rows_number_of_elements = rect.width() * channels; // Adding more elements because of SVE, where interleaving stores are // governed by one predicate. For example, if a predicate requires 7 uint8_t @@ -92,6 +92,10 @@ class SeparableFilterWorkspace { // storing 8 elements. 
Choosing '3' to account for svst4(). buffer_rows_number_of_elements += 3; + // Add the border elements, at front and end + size_t margin = (kernel_width - 1) / 2; + buffer_rows_number_of_elements += channels * margin * 2; + size_t buffer_rows_stride = buffer_rows_number_of_elements * intermediate_size; size_t buffer_rows_size = buffer_rows_stride; @@ -123,13 +127,21 @@ class SeparableFilterWorkspace { Rectangle image_size() const { return image_size_; } size_t intermediate_size() const { return intermediate_size_; } + template + class DummyBorderMaker { + void decorate(Rows, size_t, size_t) KLEIDICV_STREAMING_COMPATIBLE {} + }; + // Processes rows vertically first along the full width - template + template void process(Rectangle rect, size_t y_begin, size_t y_end, Rows src_rows, Rows dst_rows, size_t channels, typename FilterType::BorderType border_type, - FilterType filter) KLEIDICV_STREAMING_COMPATIBLE { + FilterType filter, + BorderMakerType border = + DummyBorderMaker()) + KLEIDICV_STREAMING_COMPATIBLE { // Border helper which calculates border offsets. typename FilterType::BorderInfoType vertical_border{rect.height(), border_type}; @@ -148,10 +160,13 @@ class SeparableFilterWorkspace { auto offsets = vertical_border.offsets_with_border(vertical_index); // Process in the vertical direction first. filter.process_vertical(rect.width(), src_rows.at(vertical_index), - buffer_rows, offsets); + buffer_rows.at(0, filter.margin), offsets); + border.decorate(buffer_rows.at(0, filter.margin), filter.margin, + rect.width()); // Process in the horizontal direction last. 
- process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), - filter, horizontal_border); + process_horizontal(rect.width(), buffer_rows.at(0, filter.margin), + dst_rows.at(vertical_index), filter, + horizontal_border); } } @@ -206,38 +221,37 @@ class SeparableFilterWorkspace { FilterType filter, typename FilterType::BorderInfoType horizontal_border) KLEIDICV_STREAMING_COMPATIBLE { - // Margin associated with the filter. - constexpr size_t margin = filter.margin; - - // Process data affected by left border. - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t horizontal_index = 0; horizontal_index < margin; - ++horizontal_index) { - auto offsets = - horizontal_border.offsets_with_left_border(horizontal_index); - filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), - dst_rows.at(0, horizontal_index), - offsets); - } - + /* + // Margin associated with the filter. + constexpr size_t margin = filter.margin; + + // Process data affected by left border. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + auto offsets = + horizontal_border.offsets_with_left_border(horizontal_index); + filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), + dst_rows.at(0, horizontal_index), + offsets); + } + */ // Process data which is not affected by any borders in bulk. { - size_t width_without_borders = width - (2 * margin); + // size_t width_without_borders = width - (2 * margin); auto offsets = horizontal_border.offsets_without_border(); - filter.process_horizontal(width_without_borders, - buffer_rows.at(0, margin), - dst_rows.at(0, margin), offsets); - } - - // Process data affected by right border. 
- KLEIDICV_FORCE_LOOP_UNROLL - for (size_t horizontal_index = 0; horizontal_index < margin; - ++horizontal_index) { - size_t index = width - margin + horizontal_index; - auto offsets = horizontal_border.offsets_with_right_border(index); - filter.process_horizontal_borders(buffer_rows.at(0, index), - dst_rows.at(0, index), offsets); + filter.process_horizontal(width, buffer_rows, dst_rows, offsets); } + /* + // Process data affected by right border. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t horizontal_index = 0; horizontal_index < margin; + ++horizontal_index) { + size_t index = width - margin + horizontal_index; + auto offsets = horizontal_border.offsets_with_right_border(index); + filter.process_horizontal_borders(buffer_rows.at(0, index), + dst_rows.at(0, index), offsets); + }*/ } // Offset in bytes to the buffer rows from &data_[0]. diff --git a/kleidicv/src/filters/border_generic_neon.h b/kleidicv/src/filters/border_generic_neon.h index 5837bbcfb..cb01f67c6 100644 --- a/kleidicv/src/filters/border_generic_neon.h +++ b/kleidicv/src/filters/border_generic_neon.h @@ -117,6 +117,15 @@ class GenericBorderVertical final { ptrdiff_t height_; }; // end of class GenericBorderVertical +// Dummy +template +class BorderMaker { + public: + // Replicate only + void decorate(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} +}; + } // namespace KLEIDICV_TARGET_NAMESPACE #endif // KLEIDICV_WORKSPACE_BORDER_GENERIC_NEON_H diff --git a/kleidicv/src/filters/border_generic_sc.h b/kleidicv/src/filters/border_generic_sc.h new file mode 100644 index 000000000..b79147064 --- /dev/null +++ b/kleidicv/src/filters/border_generic_sc.h @@ -0,0 +1,409 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_WORKSPACE_BORDER_GENERIC_SC_H +#define KLEIDICV_WORKSPACE_BORDER_GENERIC_SC_H + +#include +#include +#include + +#include "kleidicv/sve2.h" +#include "kleidicv/types.h" +#include 
"kleidicv/workspace/border_types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Border offsets for generic filters. +template +class GenericBorder final { + public: + explicit GenericBorder(size_t width, size_t channels, svuint16_t& left1, + svuint16_t& left2, svuint16_t& right1, + svuint16_t& right2) KLEIDICV_STREAMING_COMPATIBLE + : width_(static_cast(width)), + channels_{static_cast(channels)}, + total_width_(width_* channels_), + border_indices_left_(left1), + border_indices_left_ext_(left2), + border_indices_right_(right1), + border_indices_right_ext_(right2) { + // The result will take some elements from the image (data), and the + // remaining parts from the border. + // An index vector is prepared here to help the process, e.g. for + // replicated borders and 3 channels, the constructed index vector will + // look like this: + // [1, 2, 0, 1, 2, 3, 4, 5] + // (0,1,2 is repeated until index 0 is reached, when the image data + // begins.) Right side is similar, but it is the [5,6,7] that repeats + // after. + uint16_t left[128 + 4], right[128 + 4]; + // This is to ensure the last element be (channels_ - 1) + uint16_t bias = channels_ - 1 - ((svcnth() - 1) % channels_); + for (size_t i = 0; i < svcnth() + 4; ++i) { + left[i] = (i + bias) % channels_; + right[i] = (i % channels_) + svcnth() - channels_; + } + // Analyser thinks left[0] is garbage, but it is not. 
+ // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) + border_indices_left_ = svld1_u16(svptrue_b16(), left); + border_indices_left_ext_ = + svld1_u16(svptrue_b16(), left + channels_ - left[0]); + border_indices_right_ = svld1_u16(svptrue_b16(), right); + border_indices_right_ext_ = svld1_u16(svptrue_b16(), right + left[0]); + // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) + } + + // Raw column can be bigger than width-1 or less than 0 + ptrdiff_t get_column(ptrdiff_t raw_column) const { + // TODO more border types, this is only the Replicated + return std::max(std::min(raw_column, width_ - 1), + ptrdiff_t{0}); + } + + // Assuming that start_offset is <= 0 + svuint16_t load_left(Rows src_rows, ptrdiff_t start_offset) + const KLEIDICV_STREAMING_COMPATIBLE { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + svuint16_t data = svld1ub_u16(svptrue_b16(), &src_rows[0]); + svuint16_t indices{}; + svuint16_t increasing = svindex_u16(0, 1); + if (-start_offset < static_cast(svcnth())) { + // '-start_offset' elements from the border, the others from the data + svbool_t pg = + svcmpge_n_u16(svptrue_b16(), increasing, + static_cast(svcnth() + start_offset)); + indices = svsplice_u16(pg, border_indices_left_, increasing); + } else { + // 'shift' elements need to be shifted out + ptrdiff_t shift = channels_ - (-start_offset - svcnth()) % channels_; + svbool_t pg = svcmpge_n_u16(svptrue_b16(), increasing, shift); + indices = + svsplice_u16(pg, border_indices_left_, border_indices_left_ext_); + } + return svtbl_u16(data, indices); + } + } + + // Assuming that start_offset is >= width - svcnth() + svuint16_t load_right(Rows src_rows, ptrdiff_t start_offset) + const KLEIDICV_STREAMING_COMPATIBLE { + if constexpr (BorderType == FixedBorderType::REPLICATE) { + svuint16_t data = + svld1ub_u16(svptrue_b16(), &src_rows[total_width_ - svcnth()]); + svuint16_t indices{}; + svuint16_t increasing = svindex_u16(0, 1); + if (start_offset <= width_ * 
channels_) { + svbool_t pg = svcmpge_n_u16( + svptrue_b16(), increasing, + static_cast( + start_offset - + (total_width_ - static_cast(svcnth())))); + indices = svsplice_u16(pg, increasing, border_indices_right_); + } else { + ptrdiff_t shift = + svcnth() - + (channels_ - (start_offset - width_ * channels_) % channels_); + svbool_t pg = svcmpge_n_u16(svptrue_b16(), increasing, shift); + indices = + svsplice_u16(pg, border_indices_right_ext_, border_indices_right_); + } + + return svtbl_u16(data, indices); + } + } + + private: + ptrdiff_t width_, channels_, total_width_; + svuint16_t &border_indices_left_, &border_indices_left_ext_, + &border_indices_right_, &border_indices_right_ext_; +}; // end of class GenericBorder + +// Dummy +template +class DummyBorderMaker { + public: + // Replicate only + void decorate(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} +}; + +template +class BorderMakerArbitrary { + using VecTraits = typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + public: + // OK this is specialized for uint8_t :o + BorderMakerArbitrary(ptrdiff_t channels, svuint8_t& sv0, svuint8_t& sv1, + svuint8_t& sv2) KLEIDICV_STREAMING_COMPATIBLE + : channels_(channels), + indices0_(sv0), + indices1_(sv1), + indices2_(sv2) { + if (channels_ == 3) { + size_t kVL = VecTraits::num_lanes(); + indices0_ = svindex_u8(0, 1); + indices1_ = svindex_u8(kVL % 3, 1); + indices2_ = svindex_u8((kVL + kVL) % 3, 1); + // Decrease by 3 while they are >= 3 --> so we get the modulo + size_t steps = (kVL - 1) / 3; + for (size_t i = 0; i < steps; ++i) { + indices0_ = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices0_, 3), + indices0_, 3); + indices1_ = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices1_, 3), + indices1_, 3); + indices2_ = svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices2_, 3), + indices2_, 3); + } + } else { + indices0_ = svindex_u8(0, 1); + // It does the same as the modulo for 1,2 and 4 + indices0_ = 
svand_n_u8_x(svptrue_b8(), indices0_, channels_ - 1); + } + } + // Replicate only + void decorate(Rows rows, ptrdiff_t margin, + ptrdiff_t width) KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = VecTraits::num_lanes(); + svbool_t pg_ch = VecTraits::svwhilelt(0UL, rows.channels()); + + // right border + svuint8_t data = svld1_u8(pg_ch, &rows[(width - 1) * rows.channels()]); + if (rows.channels() == 3) { + svuint8_t data0 = svtbl_u8(data, indices0_); + svuint8_t data1 = svtbl_u8(data, indices1_); + svuint8_t data2 = svtbl_u8(data, indices2_); + ptrdiff_t width_plus = (width + margin) * 3; + for (ptrdiff_t x = width * 3; x < width_plus;) { + svbool_t pg = VecTraits::svwhilelt(x, width_plus); + svst1(pg, &rows[x], data0); + x += kVL; + pg = VecTraits::svwhilelt(x, width_plus); + svst1(pg, &rows[x], data1); + x += kVL; + pg = VecTraits::svwhilelt(x, width_plus); + svst1(pg, &rows[x], data2); + x += kVL; + } + } else { + data = svtbl_u8(data, indices0_); + ptrdiff_t width_plus = (width + margin) * rows.channels(); + for (ptrdiff_t x = width * rows.channels(); x < width_plus;) { + svbool_t pg = VecTraits::svwhilelt(x, width_plus); + svst1(pg, &rows[x], data); + x += kVL; + } + } + + // left border + data = svld1_u8(pg_ch, &rows[0]); + if (rows.channels() == 3) { + svuint8_t data0 = svtbl_u8(data, indices0_); + svuint8_t data1 = svtbl_u8(data, indices1_); + svuint8_t data2 = svtbl_u8(data, indices2_); + ptrdiff_t mwidth = margin * 3; + for (ptrdiff_t x = 0; x < mwidth;) { + svbool_t pg = VecTraits::svwhilelt(x, mwidth); + svst1(pg, &rows[x - mwidth], data0); + x += kVL; + pg = VecTraits::svwhilelt(x, mwidth); + svst1(pg, &rows[x - mwidth], data1); + x += kVL; + pg = VecTraits::svwhilelt(x, mwidth); + svst1(pg, &rows[x - mwidth], data2); + x += kVL; + } + } else { + data = svtbl_u8(data, indices0_); + ptrdiff_t mwidth = margin * rows.channels(); + for (ptrdiff_t x = 0; x < mwidth;) { + svbool_t pg = VecTraits::svwhilelt(x, mwidth); + svst1(pg, &rows[x - mwidth], data); 
+ x += kVL; + } + } + } + + private: + ptrdiff_t channels_; + svuint8_t &indices0_, &indices1_, &indices2_; +}; + +template +class BorderMakerFixed3ch { + using VecTraits = typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + public: + // OK this is specialized for uint8_t :o + BorderMakerFixed3ch(svuint8_t& sv0, svuint8_t& sv1, svuint8_t& sv2, + svbool_t& pg_last, + size_t n_elements) KLEIDICV_STREAMING_COMPATIBLE + : indices0_(sv0), + indices1_(sv1), + indices2_(sv2), + pg_last_(pg_last) { + size_t kVL = VecTraits::num_lanes() / sizeof(ScalarType); + indices0_ = svindex_u8(0, 1); + indices1_ = svindex_u8(kVL % 3, 1); + indices2_ = svindex_u8((kVL + kVL) % 3, 1); + pg_last_ = VecTraits::svwhilelt(0UL, ((n_elements - 1) % kVL) + 1); + // Decrease by 3 while they are >= 3 --> so we get the modulo + size_t steps = (kVL - 1) / 3; + for (size_t i = 0; i < steps; ++i) { + indices0_ = + svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices0_, 3), indices0_, 3); + indices1_ = + svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices1_, 3), indices1_, 3); + indices2_ = + svsub_n_u8_m(svcmpge_n_u8(svptrue_b8(), indices2_, 3), indices2_, 3); + } + } + + void decorate_one_side(Rows rows, svuint8_t data, + ptrdiff_t offset, ptrdiff_t kVL, + svbool_t pgtrue) KLEIDICV_STREAMING_COMPATIBLE { + svuint8_t data0 = svtbl_u8(data, indices0_); + svuint8_t data1 = svtbl_u8(data, indices1_); + svuint8_t data2 = svtbl_u8(data, indices2_); + ptrdiff_t x = offset; + if constexpr (NVectors == 1) { + svst1(pg_last_, &rows[x], data0); + } else { + svst1(pgtrue, &rows[x], data0); + x += kVL; + if constexpr (NVectors == 2) { + svst1(pg_last_, &rows[x], data1); + } else { + svst1(pgtrue, &rows[x], data1); + x += kVL; + if constexpr (NVectors == 3) { + svst1(pg_last_, &rows[x], data2); + } else { + svst1(pgtrue, &rows[x], data2); + x += kVL; + if constexpr (NVectors == 4) { + svst1(pg_last_, &rows[x], data0); + } else { + svst1(pgtrue, &rows[x], data0); + x 
+= kVL; + if constexpr (NVectors == 5) { + svst1(pg_last_, &rows[x], data1); + } else { + static_assert(false, "NVectors cannot be more than 5!"); + } + } + } + } + } + } + + // Replicate only + void decorate(Rows rows, ptrdiff_t margin, + ptrdiff_t width) KLEIDICV_STREAMING_COMPATIBLE { + const ptrdiff_t kVL = static_cast(VecTraits::num_lanes()); + svbool_t pg_ch = svptrue_pat_b8(SV_VL3); + svbool_t pgtrue = svptrue_b8(); + + // right border + svuint8_t data = svld1_u8(pg_ch, &rows[(width - 1) * 3]); + decorate_one_side(rows, data, width * 3, kVL, pgtrue); + + // left border + data = svld1_u8(pg_ch, &rows[0]); + decorate_one_side(rows, data, -margin * 3, kVL, pgtrue); + } + + private: + svuint8_t &indices0_, &indices1_, &indices2_; + svbool_t& pg_last_; +}; + +template +class BorderMakerFixed124ch { + using VecTraits = typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + public: + // OK this is specialized for uint8_t :o + BorderMakerFixed124ch(ptrdiff_t channels, ptrdiff_t width, ptrdiff_t margin, + svuint8_t& sv, svbool_t& pg_last, + svbool_t& pg_ch) KLEIDICV_STREAMING_COMPATIBLE + : indices_(sv), + pg_last_(pg_last), + pg_ch_(pg_ch), + left_margin_start_{-margin * channels}, + right_margin_start_{width * channels}, + last_column_{(width - 1) * channels} { + const ptrdiff_t kNElements = margin * channels; + indices_ = svindex_u8(0, 1); + // It does the same as the modulo for 1,2 and 4 + indices_ = svand_n_u8_x(svptrue_b8(), indices_, channels - 1); + pg_ch_ = VecTraits::svwhilelt(0L, channels); + ptrdiff_t kVL = VecTraits::num_lanes() / sizeof(ScalarType); + pg_last_ = VecTraits::svwhilelt( + 0L, ((static_cast(kNElements) - 1) % kVL) + 1); + } + + void decorate_one_side(Rows rows, svuint8_t data, + ptrdiff_t offset, ptrdiff_t kVL, + svbool_t pgtrue) KLEIDICV_STREAMING_COMPATIBLE { + svuint8_t filled_data = svtbl_u8(data, indices_); + ptrdiff_t x = offset; + if constexpr (NVectors == 1) { + svst1(pg_last_, 
&rows[x], filled_data); + } else { + svst1(pgtrue, &rows[x], filled_data); + x += kVL; + if constexpr (NVectors == 2) { + svst1(pg_last_, &rows[x], filled_data); + } else { + svst1(pgtrue, &rows[x], filled_data); + x += kVL; + if constexpr (NVectors == 3) { + svst1(pg_last_, &rows[x], filled_data); + } else { + svst1(pgtrue, &rows[x], filled_data); + x += kVL; + if constexpr (NVectors == 4) { + svst1(pg_last_, &rows[x], filled_data); + } else { + svst1(pgtrue, &rows[x], filled_data); + x += kVL; + if constexpr (NVectors == 5) { + svst1(pg_last_, &rows[x], filled_data); + } else { + static_assert(false, "NVectors cannot be more than 5!"); + } + } + } + } + } + } + + // Replicate only + void decorate(Rows rows, ptrdiff_t = 0, + ptrdiff_t = 0) KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = VecTraits::num_lanes(); + svbool_t pgtrue = svptrue_b8(); + + // right border + svuint8_t data = svld1_u8(pg_ch_, &rows[last_column_]); + decorate_one_side(rows, data, right_margin_start_, kVL, pgtrue); + + // left border + data = svld1_u8(pg_ch_, &rows[0]); + decorate_one_side(rows, data, left_margin_start_, kVL, pgtrue); + } + + private: + svuint8_t& indices_; + svbool_t &pg_last_, &pg_ch_; + ptrdiff_t left_margin_start_, right_margin_start_, last_column_; +}; + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_WORKSPACE_BORDER_GENERIC_NEON_H diff --git a/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp b/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp index f2c29d647..192c34fb8 100644 --- a/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_fixed_neon.cpp @@ -5,6 +5,7 @@ #include #include +#include "border_generic_neon.h" #include "kleidicv/config.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/gaussian_blur.h" @@ -376,12 +377,15 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; + 
KLEIDICV_TARGET_NAMESPACE::BorderMaker< + typename GaussianBlurFilter::BufferType> + border_maker; if constexpr (IsBinomial) { GaussianBlurFilter blur; SeparableFilter filter{blur}; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); + border_type, filter, border_maker); return KLEIDICV_OK; } else { @@ -394,7 +398,7 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( GaussianBlurFilter blur(half_kernel); SeparableFilter filter{blur}; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); + border_type, filter, border_maker); } else { for (size_t row = y_begin; row < y_end; ++row) { std::memcpy(static_cast(&dst_rows.at(row)[0]), diff --git a/kleidicv/src/filters/gaussian_blur_fixed_sc.h b/kleidicv/src/filters/gaussian_blur_fixed_sc.h index cec5a9fe0..1ed429f67 100644 --- a/kleidicv/src/filters/gaussian_blur_fixed_sc.h +++ b/kleidicv/src/filters/gaussian_blur_fixed_sc.h @@ -8,6 +8,8 @@ #include #include +#include "border_generic_sc.h" +#include "kleidicv/ctypes.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/filters/separable_filter_15x15_sc.h" #include "kleidicv/filters/separable_filter_21x21_sc.h" @@ -15,6 +17,7 @@ #include "kleidicv/filters/separable_filter_5x5_sc.h" #include "kleidicv/filters/separable_filter_7x7_sc.h" #include "kleidicv/filters/sigma.h" +#include "kleidicv/sve2.h" #include "kleidicv/workspace/separable.h" #if KLEIDICV_TARGET_SME || KLEIDICV_TARGET_SME2 @@ -274,18 +277,7 @@ class GaussianBlur { void vertical_scalar_path(const SourceType src[KernelSize], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = static_cast(src[kHalfKernelSize - 1]) * - half_kernel_[kHalfKernelSize - 1]; - - // Optimization to avoid unnecessary branching in vector code. 
- KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 0; i < kHalfKernelSize - 1; i++) { - acc += (static_cast(src[i]) + - static_cast(src[KernelSize - i - 1])) * - half_kernel_[i]; - } - - dst[0] = static_cast(rounding_shift_right(acc, 8)); + common_scalar_path(src, dst); } void horizontal_vector_path( @@ -297,7 +289,7 @@ class GaussianBlur { void horizontal_scalar_path(const BufferType src[KernelSize], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - vertical_scalar_path(src, dst); + common_scalar_path(src, dst); } private: @@ -330,6 +322,22 @@ class GaussianBlur { svst1(pg, &dst[0], result); } + void common_scalar_path(const SourceType src[KernelSize], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + uint32_t acc = static_cast(src[kHalfKernelSize - 1]) * + half_kernel_[kHalfKernelSize - 1]; + + // Optimization to avoid unnecessary branching in vector code. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 0; i < kHalfKernelSize - 1; i++) { + acc += (static_cast(src[i]) + + static_cast(src[KernelSize - i - 1])) * + half_kernel_[i]; + } + + dst[0] = static_cast(rounding_shift_right(acc, 8)); + } + const uint16_t *half_kernel_; }; // end of class GaussianBlur @@ -346,13 +354,15 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( if constexpr (IsBinomial) { GaussianBlurFilter blur; + KLEIDICV_TARGET_NAMESPACE::DummyBorderMaker border_maker; SeparableFilter filter{blur}; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); + border_type, filter, border_maker); return KLEIDICV_OK; } else { constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); + constexpr size_t kMargin = kHalfKernelSize - 1; uint16_t half_kernel[128]; generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); // If sigma is so small that the middle point gets all the weights, it's @@ -360,8 +370,57 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( if (half_kernel[kHalfKernelSize - 1] < 256) { 
GaussianBlurFilter blur(half_kernel); SeparableFilter filter{blur}; - workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); + // Maximum is (3 or 4)*10 = 30 or 40 elements -> 2 or 3 vectors + const size_t nElements = kMargin * channels; + const size_t nVectors = + (nElements + VecTraits::num_lanes() - 1) / + VecTraits::num_lanes(); + + if (channels == 3) { + svuint8_t sv0, sv1, sv2; + svbool_t pg; + if (nVectors == 1) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed3ch + border_maker(sv0, sv1, sv2, pg, nElements); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter, border_maker); + } else if (nVectors == 2) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed3ch + border_maker(sv0, sv1, sv2, pg, nElements); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter, border_maker); + + } else { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + } else { + svuint8_t sv; + svbool_t pg0, pg1; + if (nVectors == 1) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch + border_maker(static_cast(channels), + static_cast(rect.width()), kMargin, sv, + pg0, pg1); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter, border_maker); + } else if (nVectors == 2) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch + border_maker(static_cast(channels), + static_cast(rect.width()), kMargin, sv, + pg0, pg1); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter, border_maker); + } else if (nVectors == 3) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch + border_maker(static_cast(channels), + static_cast(rect.width()), kMargin, sv, + pg0, pg1); + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter, border_maker); + } else { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + } } else { for (size_t row = y_begin; row < y_end; ++row) { #if 
KLEIDICV_TARGET_SME diff --git a/kleidicv/src/filters/separable_filter_2d_api.cpp b/kleidicv/src/filters/separable_filter_2d_api.cpp index 9bebc29d7..0ace6fed3 100644 --- a/kleidicv/src/filters/separable_filter_2d_api.cpp +++ b/kleidicv/src/filters/separable_filter_2d_api.cpp @@ -83,7 +83,7 @@ kleidicv_error_t kleidicv_filter_context_create( constexpr size_t intermediate_size = sizeof(uint32_t); auto workspace = SeparableFilterWorkspace::create( Rectangle{max_image_width, max_image_height}, max_channels, - intermediate_size); + intermediate_size, max_kernel_width); if (!workspace) { *context = nullptr; return KLEIDICV_ERROR_ALLOCATION; diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp index a8eb9fa6b..3d03eb815 100644 --- a/kleidicv/src/filters/separable_filter_2d_neon.cpp +++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp @@ -4,6 +4,7 @@ #include +#include "border_generic_neon.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/filters/separable_filter_5x5_neon.h" @@ -367,8 +368,11 @@ kleidicv_error_t separable_filter_2d_stripe( Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; + KLEIDICV_TARGET_NAMESPACE::BorderMaker< + typename SeparableFilterClass::BufferType> + border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - fixed_border_type, filter); + fixed_border_type, filter, border_maker); return KLEIDICV_OK; } diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index ceb125add..da5cacd3c 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -7,6 +7,7 @@ #include +#include "border_generic_sc.h" #include "kleidicv/filters/separable_filter_5x5_sc.h" #include "kleidicv/kleidicv.h" #include "kleidicv/sve2.h" @@ -501,8 +502,11 @@ kleidicv_error_t 
separable_filter_2d_stripe_sc( Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; + KLEIDICV_TARGET_NAMESPACE::DummyBorderMaker< + typename SeparableFilterClass::BufferType> + border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - fixed_border_type, filter); + fixed_border_type, filter, border_maker); return KLEIDICV_OK; } diff --git a/kleidicv/src/filters/sobel_neon.cpp b/kleidicv/src/filters/sobel_neon.cpp index 0e2b69e1b..9620f12a5 100644 --- a/kleidicv/src/filters/sobel_neon.cpp +++ b/kleidicv/src/filters/sobel_neon.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "border_generic_neon.h" #include "kleidicv/filters/separable_filter_3x3_neon.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" @@ -145,15 +146,16 @@ kleidicv_error_t sobel_3x3_horizontal_stripe_s16_u8( Rows dst_rows{dst, dst_stride, channels}; auto workspace = - SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t)); + SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t), 3); if (!workspace) { return KLEIDICV_ERROR_ALLOCATION; } HorizontalSobel3x3 horizontal_sobel; SeparableFilter3x3> filter{horizontal_sobel}; + KLEIDICV_TARGET_NAMESPACE::BorderMaker border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - FixedBorderType::REPLICATE, filter); + FixedBorderType::REPLICATE, filter, border_maker); return KLEIDICV_OK; } @@ -175,15 +177,16 @@ kleidicv_error_t sobel_3x3_vertical_stripe_s16_u8( Rows dst_rows{dst, dst_stride, channels}; auto workspace = - SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t)); + SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t), 3); if (!workspace) { return KLEIDICV_ERROR_ALLOCATION; } VerticalSobel3x3 vertical_sobel; SeparableFilter3x3> filter{vertical_sobel}; + KLEIDICV_TARGET_NAMESPACE::BorderMaker border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - 
FixedBorderType::REPLICATE, filter); + FixedBorderType::REPLICATE, filter, border_maker); return KLEIDICV_OK; } diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index fc0a5d08f..d2b2bcc6a 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -5,6 +5,7 @@ #ifndef KLEIDICV_SOBEL_SC_H #define KLEIDICV_SOBEL_SC_H +#include "border_generic_sc.h" #include "kleidicv/filters/separable_filter_3x3_sc.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" @@ -137,15 +138,16 @@ static kleidicv_error_t sobel_3x3_horizontal_stripe_s16_u8_sc( Rows dst_rows{dst, dst_stride, channels}; auto workspace = - SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t)); + SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t), 3); if (!workspace) { return KLEIDICV_ERROR_ALLOCATION; } HorizontalSobel3x3 horizontal_sobel; SeparableFilter3x3> filter{horizontal_sobel}; + KLEIDICV_TARGET_NAMESPACE::DummyBorderMaker border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - FixedBorderType::REPLICATE, filter); + FixedBorderType::REPLICATE, filter, border_maker); return KLEIDICV_OK; } @@ -167,15 +169,16 @@ static kleidicv_error_t sobel_3x3_vertical_stripe_s16_u8_sc( Rows dst_rows{dst, dst_stride, channels}; auto workspace = - SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t)); + SeparableFilterWorkspace::create(rect, channels, sizeof(int16_t), 3); if (!workspace) { return KLEIDICV_ERROR_ALLOCATION; } VerticalSobel3x3 vertical_sobel; SeparableFilter3x3> filter{vertical_sobel}; + KLEIDICV_TARGET_NAMESPACE::DummyBorderMaker border_maker; workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - FixedBorderType::REPLICATE, filter); + FixedBorderType::REPLICATE, filter, border_maker); return KLEIDICV_OK; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ab408a553..26f71378e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ 
-1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 @@ -56,18 +56,20 @@ FetchContent_Declare( FetchContent_MakeAvailable(googletest) add_subdirectory(api) +add_subdirectory(library) add_subdirectory(framework) # Target to build all tests. add_custom_target( kleidicv-test - DEPENDS kleidicv-framework-test kleidicv-api-test + DEPENDS kleidicv-framework-test kleidicv-library-test kleidicv-api-test ) # Target to build and run all tests. add_custom_target( check-kleidicv COMMAND kleidicv-framework-test + COMMAND kleidicv-library-test COMMAND kleidicv-api-test DEPENDS kleidicv-test USES_TERMINAL diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 4bc90b2cb..fa5073387 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -275,23 +275,23 @@ size_t minimumValidWidth(size_t kernel_size, size_t vector_length) { TYPED_TEST(GaussianBlur, 3x3_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} - .with_sigma(0.01) + kReplicateBorder, kToleranceOne} + .with_sigma(0.01) .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, 5x5_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } @@ -299,16 +299,16 @@ 
TYPED_TEST(GaussianBlur, 5x5_CustomSigma) { TYPED_TEST(GaussianBlur, 7x7_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } -// 11x11 use the generic solution. +// 11x11 use the "ArbitrarySizeKernel" solution. TYPED_TEST(GaussianBlur, 11x11_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; // TODO kReplicateBorder is temporary until we implement all borders @@ -329,14 +329,14 @@ TYPED_TEST(GaussianBlur, 11x11_CustomSigma) { TYPED_TEST(GaussianBlur, 15x15_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } @@ -346,14 +346,14 @@ TYPED_TEST(GaussianBlur, 21x21_CustomSigma) { using KernelTestParams = GaussianBlurKernelTestParams; // TODO kReplicateBorder is temporary until we implement all borders GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .test_with_generated_mask(); GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(2.2) .test_with_generated_mask(); 
GaussianBlurTest{KernelTestParams{}, minimal_array_layouts_for_fixed, - kAllBorders, kToleranceOne} + kReplicateBorder, kToleranceOne} .with_sigma(0.01) .test_with_generated_mask(); } diff --git a/test/library/CMakeLists.txt b/test/library/CMakeLists.txt new file mode 100644 index 000000000..6fbb64110 --- /dev/null +++ b/test/library/CMakeLists.txt @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 + +# Choose the default value for the KLEIDICV_ENABLE_SVE2 option +# according to the compiler version. The list of compiler versions +# recognised as supporting SVE may be extended in future. +# check_cxx_compiler_flag is not used to test whether the compiler +# supports +sve2 since this may succeed for compilers that have only +# partial SVE support. +if ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) OR + (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) +) + set(KLEIDICV_ENABLE_SVE2 ON) +else() + set(KLEIDICV_ENABLE_SVE2 OFF) +endif() + +message(STATUS "KLEIDICV_ENABLE_SVE2 ${KLEIDICV_ENABLE_SVE2}") + +file(GLOB kleidicv_library_test_sources CONFIGURE_DEPENDS "*.h" "*.cpp") + +if (KLEIDICV_ENABLE_SVE2) + list(APPEND KLEIDICV_TEST_CXX_FLAGS "-march=armv8-a+sve2") +else() + list(FILTER kleidicv_library_test_sources EXCLUDE REGEX "_sve2\\.cpp$") +endif() + +list(APPEND kleidicv_library_test_sources ${KLEIDICV_TEST_FRAMEWORK_SOURCES}) + +set_source_files_properties( + ${kleidicv_library_test_sources} + PROPERTIES COMPILE_OPTIONS "${KLEIDICV_TEST_CXX_FLAGS}" +) + +add_executable( + kleidicv-library-test + ${kleidicv_library_test_sources} +) + +set_target_properties( + kleidicv-library-test + PROPERTIES CXX_STANDARD 17 +) + +target_include_directories( + kleidicv-library-test + PRIVATE ${KLEIDICV_INCLUDE_DIR} + PRIVATE ${KLEIDICV_TEST_INCLUDE_DIR} + 
${CMAKE_CURRENT_SOURCE_DIR}/../../kleidicv_thread/include +) + +if (KLEIDICV_ALLOCATION_TESTS) + target_link_options( + kleidicv-library-test + PRIVATE -Wl,--wrap,malloc + ) +endif() + + +target_link_libraries( + kleidicv-library-test + kleidicv + kleidicv_thread + gtest + gmock +) diff --git a/test/library/test_border_generic_sve2.cpp b/test/library/test_border_generic_sve2.cpp new file mode 100644 index 000000000..30af3016c --- /dev/null +++ b/test/library/test_border_generic_sve2.cpp @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "../../kleidicv/include/kleidicv/types.h" +#include "../../kleidicv/src/filters/border_generic_sc.h" +#include "framework/array.h" +#include "framework/utils.h" + +template +void test_sve_border(size_t width, size_t margin, size_t channels, + std::initializer_list expected_values) { + size_t total_width = channels * (width + 2 * margin); + test::Array2D expected(total_width, 1); + expected.set(0, 0, expected_values); + + test::Array2D actual(total_width, 1); + for (size_t x = 0; x < width * channels; ++x) { + actual.set(0, margin * channels + x, + {*expected.at(0, margin * channels + x)}); + } + + kleidicv::Rows rows(actual.at(0, margin * channels), width, + channels); + const size_t nElements = margin * channels; + const size_t nVectors = + (nElements + test::Options::vector_lanes() - 1) / + test::Options::vector_lanes(); + + if (channels == 3) { + svuint8_t sv0, sv1, sv2; + svbool_t pg; + if (nVectors == 1) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed3ch border( + sv0, sv1, sv2, pg, nElements); + border.decorate(rows, margin, width); + } else if (nVectors == 2) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed3ch border( + sv0, sv1, sv2, pg, nElements); + border.decorate(rows, margin, width); + } else { + return; // TODO test error handling (unsupported vector count) + } + } else { + svuint8_t sv; + svbool_t pg0, pg1; + if (nVectors == 1) { +
KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch border( + static_cast(channels), static_cast(width), + static_cast(margin), sv, pg0, pg1); + border.decorate(rows); + } else if (nVectors == 2) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch border( + static_cast(channels), static_cast(width), + static_cast(margin), sv, pg0, pg1); + border.decorate(rows); + } else if (nVectors == 3) { + KLEIDICV_TARGET_NAMESPACE::BorderMakerFixed124ch border( + static_cast(channels), static_cast(width), + static_cast(margin), sv, pg0, pg1); + border.decorate(rows); + } else { + // TODO test error handling + } + } + + EXPECT_EQ_ARRAY2D(expected, actual); // covers the 3-channel path too +} + +TEST(BorderMaker, Replicate_1Ch_1Element) { + test_sve_border(6, 1, 1, {1, 1, 2, 3, 4, 5, 6, 6}); +} + +TEST(BorderMaker, Replicate_1Ch_2Elements) { + test_sve_border(6, 2, 1, {1, 1, 1, 2, 3, 4, 5, 6, 6, 6}); +} + +TEST(BorderMaker, Replicate_1Ch_3Elements) { + test_sve_border(6, 3, 1, {1, 1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6}); +} + +TEST(BorderMaker, Replicate_1Ch_9Elements) { + test_sve_border(6, 9, 1, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}); +} + +TEST(BorderMaker, Replicate_2Ch_1Element) { + test_sve_border(3, 1, 2, {1, 2, 1, 2, 3, 4, 5, 6, 5, 6}); +} + +TEST(BorderMaker, Replicate_2Ch_2Elements) { + test_sve_border(3, 2, 2, {1, 2, 1, 2, 1, 2, 3, 4, 5, 6, 5, 6, 5, 6}); +} + +TEST(BorderMaker, Replicate_2Ch_3Elements) { + test_sve_border( + 3, 3, 2, {1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 5, 6, 5, 6, 5, 6, 5, 6}); +} + +TEST(BorderMaker, Replicate_2Ch_5Elements) { + test_sve_border(3, 5, 2, {1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, + 4, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6}); +} + +TEST(BorderMaker, Replicate_3Ch_1Element) { + test_sve_border(3, 1, 3, + {1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9}); +} + +TEST(BorderMaker, Replicate_3Ch_2Elements) { + test_sve_border( + 3, 2, 3, {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9}); +} + +TEST(BorderMaker, Replicate_3Ch_3Elements) { +
test_sve_border(3, 3, 3, {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9}); +} + +TEST(BorderMaker, Replicate_3Ch_5Elements) { + test_sve_border( + 3, 5, 3, {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9}); +} + +TEST(BorderMaker, Replicate_3Ch_17Elements) { + test_sve_border( + 3, 17, 3, + {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, + 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, + 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, + 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, + 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9, 7, 8, 9}); +} + +TEST(BorderMaker, Replicate_4Ch_1Element) { + test_sve_border(3, 1, 4, {1, 2, 3, 4, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 9, 10, 11, 12}); +} + +TEST(BorderMaker, Replicate_4Ch_2Elements) { + test_sve_border(3, 2, 4, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12}); +} + +TEST(BorderMaker, Replicate_4Ch_3Elements) { + test_sve_border( + 3, 3, 4, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12}); +} + +TEST(BorderMaker, Replicate_4Ch_5Elements) { + test_sve_border( + 3, 5, 4, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, + 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, + 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12, 9, 10, 11, 12}); +} -- GitLab From 37534e7e4a1e2a2728ccde0560a57c42c8b1afb9 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 5 Aug 2025 07:05:57 +0000 Subject: [PATCH 2/2] GaussianBlur: delayed BorderMaker prototype --- conformity/opencv/test_gaussian_blur.cpp | 160 +++--------------- conformity/opencv/tests.cpp | 21 --- .../filters/separable_filter_15x15_neon.h | 8 +- .../filters/separable_filter_15x15_sc.h | 63 +++++-- .../filters/separable_filter_21x21_neon.h | 
8 +- .../filters/separable_filter_21x21_sc.h | 63 +++++-- .../filters/separable_filter_3x3_neon.h | 8 +- .../filters/separable_filter_3x3_sc.h | 65 +++++-- .../filters/separable_filter_5x5_neon.h | 8 +- .../filters/separable_filter_5x5_sc.h | 65 +++++-- .../filters/separable_filter_7x7_neon.h | 8 +- .../filters/separable_filter_7x7_sc.h | 65 +++++-- .../include/kleidicv/workspace/separable.h | 33 ++-- kleidicv/src/filters/border_generic_sc.h | 64 +++++-- scripts/run_opencv_conformity_checks.sh | 2 - 15 files changed, 370 insertions(+), 271 deletions(-) diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index 226f7e4fa..ca45e7865 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -28,8 +28,10 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { cv::RNG rng(0); - size_t size_min = std::max(4, KernelSize - 1); - size_t size_max = std::max(16, 2 * KernelSize + 2); + size_t size_min = 160; + // std::max(4, KernelSize - 1); + size_t size_max = 160; + // std::max(16, 2 * KernelSize + 2); for (size_t y = size_min; y <= size_max; ++y) { for (size_t x = size_min; x <= size_max; ++x) { @@ -76,136 +78,30 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, std::vector& gaussian_blur_tests_get() { // clang-format off static std::vector tests = { - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 4 channel", 
(test_gaussian_blur<3, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<3, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 3x3, BORDER_REFLECT, 1 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<3, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT, 2 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<3, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT, 3 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<3, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 3x3, BORDER_REFLECT, 4 channel", (test_gaussian_blur<3, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<3, cv::BORDER_REFLECT>)), - - TEST("Gaussian blur 3x3, BORDER_WRAP, 1 channel", (test_gaussian_blur<3, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<3, cv::BORDER_WRAP>)), - TEST("Gaussian blur 3x3, BORDER_WRAP, 2 channel", (test_gaussian_blur<3, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<3, cv::BORDER_WRAP>)), - TEST("Gaussian blur 3x3, BORDER_WRAP, 3 channel", (test_gaussian_blur<3, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<3, cv::BORDER_WRAP>)), - TEST("Gaussian blur 3x3, BORDER_WRAP, 4 channel", (test_gaussian_blur<3, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<3, cv::BORDER_WRAP>)), - - 
TEST("Gaussian blur 3x3, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 3x3, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 3x3, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 3x3, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), - - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<5, cv::BORDER_REFLECT_101>)), - - 
TEST("Gaussian blur 5x5, BORDER_REFLECT, 1 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<5, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT, 2 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<5, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT, 3 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<5, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 5x5, BORDER_REFLECT, 4 channel", (test_gaussian_blur<5, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<5, cv::BORDER_REFLECT>)), - - TEST("Gaussian blur 5x5, BORDER_WRAP, 1 channel", (test_gaussian_blur<5, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<5, cv::BORDER_WRAP>)), - TEST("Gaussian blur 5x5, BORDER_WRAP, 2 channel", (test_gaussian_blur<5, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<5, cv::BORDER_WRAP>)), - TEST("Gaussian blur 5x5, BORDER_WRAP, 3 channel", (test_gaussian_blur<5, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<5, cv::BORDER_WRAP>)), - TEST("Gaussian blur 5x5, BORDER_WRAP, 4 channel", (test_gaussian_blur<5, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<5, cv::BORDER_WRAP>)), - - TEST("Gaussian blur 5x5, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 5x5, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 5x5, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 5x5, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), - - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 2 channel", 
(test_gaussian_blur<7, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<7, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 7x7, BORDER_REFLECT, 1 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT, 2 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT, 3 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 7x7, BORDER_REFLECT, 4 channel", (test_gaussian_blur<7, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<7, cv::BORDER_REFLECT>)), - - TEST("Gaussian blur 7x7, BORDER_WRAP, 1 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), - TEST("Gaussian blur 7x7, BORDER_WRAP, 2 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 2>), 
(exec_gaussian_blur<7, cv::BORDER_WRAP>)), - TEST("Gaussian blur 7x7, BORDER_WRAP, 3 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), - TEST("Gaussian blur 7x7, BORDER_WRAP, 4 channel", (test_gaussian_blur<7, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<7, cv::BORDER_WRAP>)), - - TEST("Gaussian blur 7x7, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 7x7, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 7x7, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 7x7, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), - - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian 
blur 15x15, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<15, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 15x15, BORDER_REFLECT, 1 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT, 2 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT, 3 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 15x15, BORDER_REFLECT, 4 channel", (test_gaussian_blur<15, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<15, cv::BORDER_REFLECT>)), - - TEST("Gaussian blur 15x15, BORDER_WRAP, 1 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), - TEST("Gaussian blur 15x15, BORDER_WRAP, 2 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), - TEST("Gaussian blur 15x15, BORDER_WRAP, 3 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), - TEST("Gaussian blur 15x15, BORDER_WRAP, 4 channel", (test_gaussian_blur<15, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<15, cv::BORDER_WRAP>)), - - TEST("Gaussian blur 15x15, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 15x15, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 15x15, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 3>), 
(exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 15x15, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), - - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 1 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 1>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 2 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 2>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 3 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 3>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 4 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 4>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 1 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 1, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 2 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 2, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 3 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 3, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT_101, 4 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REFLECT_101, 4, false>), (exec_gaussian_blur<21, cv::BORDER_REFLECT_101>)), - - TEST("Gaussian blur 21x21, BORDER_REFLECT, 1 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 1>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT, 2 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 2>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT, 3 channel", 
(test_gaussian_blur<21, cv::BORDER_REFLECT, 3>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), - TEST("Gaussian blur 21x21, BORDER_REFLECT, 4 channel", (test_gaussian_blur<21, cv::BORDER_REFLECT, 4>), (exec_gaussian_blur<21, cv::BORDER_REFLECT>)), - - TEST("Gaussian blur 21x21, BORDER_WRAP, 1 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 1>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), - TEST("Gaussian blur 21x21, BORDER_WRAP, 2 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 2>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), - TEST("Gaussian blur 21x21, BORDER_WRAP, 3 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 3>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), - TEST("Gaussian blur 21x21, BORDER_WRAP, 4 channel", (test_gaussian_blur<21, cv::BORDER_WRAP, 4>), (exec_gaussian_blur<21, cv::BORDER_WRAP>)), - - TEST("Gaussian blur 21x21, BORDER_REPLICATE, 1 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 1>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 21x21, BORDER_REPLICATE, 2 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 2>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 21x21, BORDER_REPLICATE, 3 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 3>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 21x21, BORDER_REPLICATE, 4 channel", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 4>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), - - // Generic kernel size - TEST("Gaussian blur 9x9, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 9x9, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 9x9, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<9, 
cv::BORDER_REPLICATE>)), - TEST("Gaussian blur 9x9, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<9, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<9, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 3x3, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 3x3, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 3x3, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 3x3, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<3, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<3, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 5x5, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 5x5, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 5x5, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 5x5, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<5, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<5, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, 
BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 7x7, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<7, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<7, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 15x15, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<15, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<15, cv::BORDER_REPLICATE>)), + + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 1 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 1, false>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 2 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 2, false>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 3 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 3, false>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), + TEST("Gaussian blur 21x21, BORDER_REPLICATE, 4 channel, random sigma", (test_gaussian_blur<21, cv::BORDER_REPLICATE, 4, false>), (exec_gaussian_blur<21, cv::BORDER_REPLICATE>)), }; // clang-format on return tests; diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp index 55744af48..0676985b6 100644 --- a/conformity/opencv/tests.cpp +++ b/conformity/opencv/tests.cpp @@ -23,28 +23,7 @@ 
static std::vector merge_tests( } std::vector all_tests = merge_tests({ - binary_op_tests_get, - cvtcolor_tests_get, - morphology_tests_get, -#if KLEIDICV_ENABLE_ALL_OPENCV_HAL - separable_filter_2d_tests_get, -#endif gaussian_blur_tests_get, - rgb2yuv_tests_get, - yuv2rgb_tests_get, - sobel_tests_get, - exp_tests_get, - float_conversion_tests_get, - resize_tests_get, - scale_tests_get, - sum_tests_get, - min_max_tests_get, - in_range_tests_get, - remap_tests_get, - warp_perspective_tests_get, - blur_and_downsample_tests_get, - scharr_interleaved_tests_get, - median_blur_tests_get, }); #if MANAGER diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_neon.h index 2475d1db3..bc47ff265 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_neon.h @@ -34,9 +34,10 @@ class SeparableFilter { static constexpr size_t margin = 7UL; + template void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; @@ -81,9 +82,10 @@ class SeparableFilter { }); } + template void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const { + BorderOffsets border_offsets, BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index 1ff29d18a..6ba7f083c 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -38,43 +38,76 @@ class SeparableFilter { static constexpr size_t margin = 7UL; - void process_vertical( - size_t 
width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + template + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType border) const + KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_left(dst_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows src_rows, Rows 
dst_rows, + BorderOffsets border_offsets, + BorderMakerType border) const KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_right(src_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index + 8 * kVL); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h index 27edb8e0b..b06d729a8 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h 
+++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h @@ -34,9 +34,10 @@ class SeparableFilter { static constexpr size_t margin = 10UL; + template void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; @@ -71,9 +72,10 @@ class SeparableFilter { // and the NEON vector length is 16 which is smaller than that). } + template void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const { + BorderOffsets border_offsets, BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h index 73708ef70..829c01594 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h @@ -38,43 +38,76 @@ class SeparableFilter { static constexpr size_t margin = 10UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; + template + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType border) const + KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + 
vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_left(dst_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + BorderMakerType border) const KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + 
horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_right(src_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index + 8 * kVL); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h index a461facad..130213218 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h @@ -35,9 +35,10 @@ class SeparableFilter { static constexpr size_t margin = 1UL; + template void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; @@ -83,9 +84,10 @@ class SeparableFilter { }); } + template void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const { + BorderOffsets border_offsets, BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; diff --git 
a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index 31005a941..e355fa31d 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -38,43 +38,76 @@ class SeparableFilter { static constexpr size_t margin = 1UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - + template + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType border) const + KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = SourceVecTraits::num_lanes(); + + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_left(dst_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = 
SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + BorderMakerType border) const KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_right(src_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index + 8 * kVL); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, 
length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_5x5_neon.h index 34f4290d7..b4c04437b 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_neon.h @@ -34,9 +34,10 @@ class SeparableFilter { static constexpr size_t margin = 2UL; + template void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; @@ -61,9 +62,10 @@ class SeparableFilter { }); } + template void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const { + BorderOffsets border_offsets, BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h index ff0b719ee..1095bb3d5 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h @@ -38,43 +38,76 @@ class SeparableFilter { static constexpr size_t margin = 2UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - + template + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType border) const + KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + + svbool_t pg_all = 
SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_left(dst_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + BorderMakerType border) const KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + 
horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_right(src_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index + 8 * kVL); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_neon.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_neon.h index 4305d9d06..8af5ad812 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_neon.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_neon.h @@ -34,9 +34,10 @@ class SeparableFilter { static constexpr size_t margin = 3UL; + template void process_vertical(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const { + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; @@ -65,9 +66,10 @@ 
class SeparableFilter { }); } + template void process_horizontal(size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const { + BorderOffsets border_offsets, BorderMakerType) const { LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h index 05fb376a9..baf223a59 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h @@ -38,43 +38,76 @@ class SeparableFilter { static constexpr size_t margin = 3UL; - void process_vertical( - size_t width, Rows src_rows, Rows dst_rows, - BorderOffsets border_offsets) const KLEIDICV_STREAMING_COMPATIBLE { - LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; - + template + void process_vertical(size_t width, Rows src_rows, + Rows dst_rows, BorderOffsets border_offsets, + BorderMakerType border) const + KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + + svbool_t pg_all = SourceVecTraits::svptrue(); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_left(dst_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg_all = 
SourceVecTraits::svptrue(); - vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = SourceVecTraits::svwhilelt(index, length); - vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index); + vertical_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } - void process_horizontal(size_t width, Rows src_rows, - Rows dst_rows, - BorderOffsets border_offsets) const - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows src_rows, Rows dst_rows, + BorderOffsets border_offsets, + BorderMakerType border) const KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = BufferVecTraits::num_lanes(); + svbool_t pg_all = BufferVecTraits::svptrue(); - LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 0); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + kVL + kVL + kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 4 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 5 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 6 * kVL); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, 7 * kVL); + border.decorate_from_right(src_rows, margin, width); + + LoopUnroll2 loop{width * src_rows.channels() - 8 * kVL, kVL}; loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, - index); + index + 8 * kVL); }); loop.unroll_once([&](size_t index) KLEIDICV_STREAMING_COMPATIBLE { - 
horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); loop.remaining( [&](size_t index, size_t length) KLEIDICV_STREAMING_COMPATIBLE { svbool_t pg = BufferVecTraits::svwhilelt(index, length); - horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index); + horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, + index + 8 * kVL); }); } diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index d98f7e6a2..385b6e0bf 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -129,7 +129,12 @@ class SeparableFilterWorkspace { template class DummyBorderMaker { + public: void decorate(Rows, size_t, size_t) KLEIDICV_STREAMING_COMPATIBLE {} + void decorate_from_left(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} + void decorate_from_right(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} }; // Processes rows vertically first along the full width @@ -160,13 +165,16 @@ class SeparableFilterWorkspace { auto offsets = vertical_border.offsets_with_border(vertical_index); // Process in the vertical direction first. filter.process_vertical(rect.width(), src_rows.at(vertical_index), - buffer_rows.at(0, filter.margin), offsets); - border.decorate(buffer_rows.at(0, filter.margin), filter.margin, - rect.width()); + buffer_rows.at(0, filter.margin), offsets, + border); + // Nope, decorate now runs in vertical / horizontal + // border.decorate(buffer_rows.at(0, filter.margin), filter.margin, + // rect.width()); + // Process in the horizontal direction last. 
process_horizontal(rect.width(), buffer_rows.at(0, filter.margin), - dst_rows.at(vertical_index), filter, - horizontal_border); + dst_rows.at(vertical_index), filter, horizontal_border, + border); } } @@ -214,13 +222,12 @@ class SeparableFilterWorkspace { } protected: - template - void process_horizontal(size_t width, - Rows buffer_rows, - Rows dst_rows, - FilterType filter, - typename FilterType::BorderInfoType horizontal_border) - KLEIDICV_STREAMING_COMPATIBLE { + template + void process_horizontal( + size_t width, Rows buffer_rows, + Rows dst_rows, FilterType filter, + typename FilterType::BorderInfoType horizontal_border, + BorderMakerType border) KLEIDICV_STREAMING_COMPATIBLE { /* // Margin associated with the filter. constexpr size_t margin = filter.margin; @@ -240,7 +247,7 @@ class SeparableFilterWorkspace { { // size_t width_without_borders = width - (2 * margin); auto offsets = horizontal_border.offsets_without_border(); - filter.process_horizontal(width, buffer_rows, dst_rows, offsets); + filter.process_horizontal(width, buffer_rows, dst_rows, offsets, border); } /* // Process data affected by right border. 
diff --git a/kleidicv/src/filters/border_generic_sc.h b/kleidicv/src/filters/border_generic_sc.h index b79147064..761748297 100644 --- a/kleidicv/src/filters/border_generic_sc.h +++ b/kleidicv/src/filters/border_generic_sc.h @@ -121,12 +121,14 @@ class GenericBorder final { }; // end of class GenericBorder // Dummy -template +template class DummyBorderMaker { public: - // Replicate only - void decorate(Rows, size_t, - size_t) KLEIDICV_STREAMING_COMPATIBLE {} + void decorate(Rows, size_t, size_t) KLEIDICV_STREAMING_COMPATIBLE {} + void decorate_from_left(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} + void decorate_from_right(Rows, size_t, + size_t) KLEIDICV_STREAMING_COMPATIBLE {} }; template @@ -307,15 +309,37 @@ class BorderMakerFixed3ch { svbool_t pg_ch = svptrue_pat_b8(SV_VL3); svbool_t pgtrue = svptrue_b8(); + // left border + svuint8_t data = svld1_u8(pg_ch, &rows[0]); + decorate_one_side(rows, data, -margin * 3, kVL, pgtrue); + // right border - svuint8_t data = svld1_u8(pg_ch, &rows[(width - 1) * 3]); + data = svld1_u8(pg_ch, &rows[(width - 1) * 3]); decorate_one_side(rows, data, width * 3, kVL, pgtrue); + } - // left border - data = svld1_u8(pg_ch, &rows[0]); + void decorate_from_left(Rows rows, ptrdiff_t margin, + ptrdiff_t) KLEIDICV_STREAMING_COMPATIBLE { + const ptrdiff_t kVL = static_cast(VecTraits::num_lanes()); + svbool_t pg_ch = svptrue_pat_b8(SV_VL3); + svbool_t pgtrue = svptrue_b8(); + + // from pixels on the left border + svuint8_t data = svld1_u8(pg_ch, &rows[0]); decorate_one_side(rows, data, -margin * 3, kVL, pgtrue); } + void decorate_from_right(Rows rows, ptrdiff_t, + ptrdiff_t width) KLEIDICV_STREAMING_COMPATIBLE { + const ptrdiff_t kVL = static_cast(VecTraits::num_lanes()); + svbool_t pg_ch = svptrue_pat_b8(SV_VL3); + svbool_t pgtrue = svptrue_b8(); + + // from pixels on the right border + svuint8_t data = svld1_u8(pg_ch, &rows[(width - 1) * 3]); + decorate_one_side(rows, data, width * 3, kVL, pgtrue); + } + private: svuint8_t 
&indices0_, &indices1_, &indices2_; svbool_t& pg_last_; @@ -389,15 +413,35 @@ class BorderMakerFixed124ch { const size_t kVL = VecTraits::num_lanes(); svbool_t pgtrue = svptrue_b8(); + // left border + svuint8_t data = svld1_u8(pg_ch_, &rows[0]); + decorate_one_side(rows, data, left_margin_start_, kVL, pgtrue); + // right border - svuint8_t data = svld1_u8(pg_ch_, &rows[last_column_]); + data = svld1_u8(pg_ch_, &rows[last_column_]); decorate_one_side(rows, data, right_margin_start_, kVL, pgtrue); + } - // left border - data = svld1_u8(pg_ch_, &rows[0]); + void decorate_from_left(Rows rows, ptrdiff_t, + ptrdiff_t) KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = VecTraits::num_lanes(); + svbool_t pgtrue = svptrue_b8(); + + // from pixels on the left border + svuint8_t data = svld1_u8(pg_ch_, &rows[0]); decorate_one_side(rows, data, left_margin_start_, kVL, pgtrue); } + void decorate_from_right(Rows rows, ptrdiff_t, + ptrdiff_t) KLEIDICV_STREAMING_COMPATIBLE { + const size_t kVL = VecTraits::num_lanes(); + svbool_t pgtrue = svptrue_b8(); + + // from pixels on the right border + svuint8_t data = svld1_u8(pg_ch_, &rows[last_column_]); + decorate_one_side(rows, data, right_margin_start_, kVL, pgtrue); + } + private: svuint8_t& indices_; svbool_t &pg_last_, &pg_ch_; diff --git a/scripts/run_opencv_conformity_checks.sh b/scripts/run_opencv_conformity_checks.sh index a3ec54b59..8dfbb049b 100755 --- a/scripts/run_opencv_conformity_checks.sh +++ b/scripts/run_opencv_conformity_checks.sh @@ -90,8 +90,6 @@ cmake "${common_cmake_args[@]}" \ ninja -C "${OPENCV_KLEIDICV_PATH}" manager TESTRESULT=0 -qemu-aarch64 -cpu cortex-a35 "${OPENCV_KLEIDICV_PATH}/bin/manager" "${OPENCV_DEFAULT_PATH}/bin/subordinate" || TESTRESULT=1 qemu-aarch64 -cpu max,sve128=on,sme=off "${OPENCV_KLEIDICV_PATH}/bin/manager" "${OPENCV_DEFAULT_PATH}/bin/subordinate" || TESTRESULT=1 -qemu-aarch64 -cpu max,sve128=on,sme512=on "${OPENCV_KLEIDICV_PATH}/bin/manager" "${OPENCV_DEFAULT_PATH}/bin/subordinate" || 
TESTRESULT=1 exit $TESTRESULT -- GitLab