From b46db5f819e38f26e7e6e0f3736253a88432381f Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 7 Jan 2025 17:08:23 +0000 Subject: [PATCH 1/2] Remove SME remap implementation --- kleidicv/src/transform/remap_api.cpp | 7 +- kleidicv/src/transform/remap_sc.h | 132 ++------------------------ kleidicv/src/transform/remap_sme2.cpp | 34 ------- 3 files changed, 11 insertions(+), 162 deletions(-) delete mode 100644 kleidicv/src/transform/remap_sme2.cpp diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 5df6feb1c..3abc4071f 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -14,7 +14,6 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16_u16, &kleidicv::neon::remap_s16, &kleidicv::sve2::remap_s16, nullptr); -KLEIDICV_MULTIVERSION_C_API( - kleidicv_remap_s16point5_u8, &kleidicv::neon::remap_s16point5, - &kleidicv::sve2::remap_s16point5, - KLEIDICV_SME2_IMPL_IF(&kleidicv::sme2::remap_s16point5)); +KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u8, + &kleidicv::neon::remap_s16point5, + &kleidicv::sve2::remap_s16point5, nullptr); diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index af88303ea..00fd7de13 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -19,8 +19,6 @@ namespace KLEIDICV_TARGET_NAMESPACE { -#if !KLEIDICV_TARGET_SME2 - template class RemapS16 { public: @@ -162,13 +160,11 @@ kleidicv_error_t remap_s16_sc(const T* src, size_t src_stride, size_t src_width, return KLEIDICV_OK; } -#endif // KLEIDICV_TARGET_SME2 - template -class RemapS16Point5SVE2; +class RemapS16Point5; template <> -class RemapS16Point5SVE2 { +class RemapS16Point5 { public: using ScalarType = uint8_t; using MapVecTraits = VecTraits; @@ -177,9 +173,9 @@ class RemapS16Point5SVE2 { using FracVecTraits = VecTraits; using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5SVE2(Rows src_rows, size_t src_width, - size_t src_height, svuint16_t& v_src_stride, - MapVectorType& v_x_max, MapVectorType& v_y_max) + RemapS16Point5(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) : src_rows_{src_rows}, v_src_stride_{v_src_stride}, v_xmax_{v_x_max}, @@ -301,114 +297,7 @@ class RemapS16Point5SVE2 { svuint16_t& v_src_stride_; MapVectorType& v_xmax_; MapVectorType& v_ymax_; -}; // end of class RemapS16Point5SVE2 - -#if KLEIDICV_TARGET_SME2 - -template -class RemapS16Point5SME2; - -template <> -class RemapS16Point5SME2 : public RemapS16Point5SVE2 { - public: - using ScalarType = uint8_t; - using MapVecTraits = VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; - using MapVector2Type = typename MapVecTraits::Vector2Type; - using FracVecTraits = VecTraits; - using FracVectorType = typename FracVecTraits::VectorType; - - RemapS16Point5SME2(Rows src_rows, size_t src_width, - size_t src_height, svuint16_t& v_src_stride, - MapVectorType& v_x_max, MapVectorType& v_y_max) - : RemapS16Point5SVE2(src_rows, src_width, src_height, - v_src_stride, v_x_max, v_y_max), - rowbuffer_width_{0} {} - - void process_row(size_t width, Columns mapxy, - Columns mapfrac, Columns dst) { - if (KLEIDICV_UNLIKELY(width != rowbuffer_width_)) { - if (width > rowbuffer_width_) { - rowbuffer_.reset(new ScalarType[4 * (width + 3)]); - rowbuffer_width_ = width; - } - } - - Columns demapped_src{rowbuffer_.get(), - 4 * src_rows_.channels()}; - - svuint16_t src_a, src_b, src_c, src_d; - - auto vector_path = [&](svbool_t pg, ptrdiff_t step) { - load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); - - // Keep only the 8-bit data - svuint8_t src_ab = - svtrn1_u8(svreinterpret_u8_u16(src_a), svreinterpret_u8_u16(src_b)); - svuint8_t src_cd = - svtrn1_u8(svreinterpret_u8_u16(src_c), svreinterpret_u8_u16(src_d)); - - // Store them to rowbuffer - // Interleaved store makes it abcdabcd - svst2_u16(pg, reinterpret_cast(&demapped_src[0]), - svcreate2(svreinterpret_u16_u8(src_ab), - svreinterpret_u16_u8(src_cd))); - demapped_src += step; - }; - - LoopUnroll loop{width, MapVecTraits::num_lanes()}; - loop.unroll_once([&](size_t step) { - svbool_t pg = MapVecTraits::svptrue(); - vector_path(pg, static_cast(step)); - }); - loop.remaining([&](size_t length, size_t step) { - svbool_t pg = MapVecTraits::svwhilelt(step - length, step); - vector_path(pg, static_cast(length)); - }); - - process_demapped_row(mapfrac, dst); - } - - private: - KLEIDICV_LOCALLY_STREAMING void process_demapped_row( - Columns mapfrac, Columns dst) { - Columns demapped_src{rowbuffer_.get(), - 4 * src_rows_.channels()}; - - svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto vector_path = [&](svbool_t pg16, svbool_t pg8, - ptrdiff_t step) KLEIDICV_STREAMING_COMPATIBLE { - // Deinterleave abcd into two vectors, ac and bd - svuint8x2_t src = svld2_u8(pg8, &demapped_src[0]); - - svuint16_t src_a = svmovlb_u16(svget2(src, 0)); - svuint16_t src_b = svmovlb_u16(svget2(src, 1)); - svuint16_t src_c = svmovlt_u16(svget2(src, 0)); - svuint16_t src_d = svmovlt_u16(svget2(src, 1)); - - interpolate_and_store(pg16, step, mapfrac, dst, src_a, src_b, src_c, - src_d, bias); - demapped_src += step; - }; - - svbool_t ptrue_16 = FracVecTraits::svptrue(); - svbool_t ptrue_8 = svptrue_b8(); - LoopUnroll loop{rowbuffer_width_, FracVecTraits::num_lanes()}; - loop.unroll_once([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { - vector_path(ptrue_16, ptrue_8, static_cast(step)); - }); - loop.remaining([&](size_t length, size_t) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg16 = FracVecTraits::svwhilelt(size_t{0}, length); - svbool_t pg8 = svwhilelt_b8(size_t{0}, 2 * length); - vector_path(pg16, pg8, static_cast(length)); - }); - } - - size_t rowbuffer_width_; - std::unique_ptr rowbuffer_; -}; // end of class RemapS16Point5SME2 -#endif // KLEIDICV_TARGET_SME2 +}; // end of class RemapS16Point5 template kleidicv_error_t remap_s16point5_sc( @@ -436,13 +325,8 @@ kleidicv_error_t remap_s16point5_sc( svuint16_t sv_src_stride; svint16_t sv_xmax, sv_ymax; -#if KLEIDICV_TARGET_SME2 - RemapS16Point5SME2 operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; -#else - RemapS16Point5SVE2 operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; -#endif // KLEIDICV_TARGET_SME2 + RemapS16Point5 operation{src_rows, src_width, src_height, + sv_src_stride, sv_xmax, sv_ymax}; Rectangle rect{dst_width, dst_height}; zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); return KLEIDICV_OK; diff --git a/kleidicv/src/transform/remap_sme2.cpp b/kleidicv/src/transform/remap_sme2.cpp deleted file mode 100644 index 426ccda3f..000000000 --- a/kleidicv/src/transform/remap_sme2.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include "remap_sc.h" - -namespace kleidicv::sme2 { - -template -kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, - size_t src_width, size_t src_height, T *dst, - size_t dst_stride, size_t dst_width, - size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - const uint16_t *mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_s16point5_sc(src, src_stride, src_width, src_height, - dst, dst_stride, dst_width, dst_height, - channels, mapxy, mapxy_stride, mapfrac, - mapfrac_stride, border_type, border_value); -} - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ - const uint16_t *mapfrac, size_t mapfrac_stride, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); - -} // namespace kleidicv::sme2 -- GitLab From 38fedc2de039f587be1c9cddf1c9fd8f567474f8 Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 7 Jan 2025 14:30:10 +0000 Subject: [PATCH 2/2] Implement remap constant border --- conformity/opencv/test_remap.cpp | 10 +- doc/opencv.md | 2 +- kleidicv/include/kleidicv/kleidicv.h | 1 + kleidicv/include/kleidicv/transform/remap.h | 9 +- kleidicv/src/transform/remap_neon.cpp | 342 ++++++++++++++++-- kleidicv/src/transform/remap_sc.h | 373 ++++++++++++++++---- scripts/benchmark/benchmarks.txt | 9 +- test/api/test_remap.cpp | 262 +++++++++----- test/api/test_thread.cpp | 12 +- 9 files changed, 816 insertions(+), 204 deletions(-) diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index 0455d6773..c60789cc6 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -134,9 +134,13 @@ bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue, std::vector& remap_tests_get() { // clang-format off static std::vector tests = { - TEST("RemapS16 uint8", (test_remap_s16), (exec_remap_s16)), - TEST("RemapS16 uint16", (test_remap_s16), (exec_remap_s16)), - TEST("RemapS16Point5 uint8", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16 uint8 Replicate", (test_remap_s16), (exec_remap_s16)), + TEST("RemapS16 uint16 Replicate", (test_remap_s16), (exec_remap_s16)), + TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + + TEST("RemapS16 uint8 Constant", (test_remap_s16), (exec_remap_s16)), + TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), + TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), }; // clang-format on return tests; diff --git a/doc/opencv.md b/doc/opencv.md index 478712113..5bef02c0a 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -214,7 +214,7 @@ Notes on parameters: * `src.depth()` * Supports `CV_8U` and `CV_16U` depths and 1 channel when prodiving only an integer map (`map1`). * Supports `CV_8U` depth and 1 channel when providing integer + fractional maps (`map1` and `map2`). -* `borderMode` - only supports `BORDER_REPLICATE`. \ +* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. \ Supported map configurations: * `map1` is 16SC2: channel #1 is x coordinate (column) and channel #2 is y (row) * supported `interpolation`: `INTER_NEAREST` only diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 517cd41e5..75e7b3511 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1789,6 +1789,7 @@ KLEIDICV_API_DECLARATION(kleidicv_in_range_f32, const float *src, /// @param channels Number of channels in the data. Must be 1. /// @param border_type Way of handling the border. The supported border types /// are: \n +/// - @ref KLEIDICV_BORDER_TYPE_CONSTANT /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE /// @param border_value Border value if the border_type is /// @ref KLEIDICV_BORDER_TYPE_CONSTANT. diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 204766006..759470129 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -23,7 +23,10 @@ inline bool remap_s16_is_implemented( dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && - border_type == KLEIDICV_BORDER_TYPE_REPLICATE && channels == 1); + dst_width >= 8 && + (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || + border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && + channels == 1); } else { return false; } @@ -39,7 +42,9 @@ inline bool remap_s16point5_is_implemented( dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && - border_type == KLEIDICV_BORDER_TYPE_REPLICATE && channels == 1); + (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || + border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && + channels == 1); } else { return false; } diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp index 1a799cf9e..d29200e86 100644 --- a/kleidicv/src/transform/remap_neon.cpp +++ b/kleidicv/src/transform/remap_neon.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include "kleidicv/kleidicv.h" @@ -11,19 +12,34 @@ namespace kleidicv::neon { template -class RemapS16 { +class RemapS16Replicate { public: using MapVecTraits = neon::VecTraits; using MapVectorType = typename MapVecTraits::VectorType; using MapVector2Type = typename MapVecTraits::Vector2Type; - RemapS16(Rows src_rows, size_t src_width, size_t src_height) + RemapS16Replicate(Rows src_rows, size_t src_width, + size_t src_height) : src_rows_{src_rows}, - v_src_element_stride{vdupq_n_u16( + v_src_element_stride_{vdupq_n_u16( static_cast(src_rows_.stride() / sizeof(ScalarType)))}, v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + void transform_pixels(uint32x4_t indices_low, uint32x4_t indices_high, + Columns dst) { + // Copy pixels from source + dst[0] = src_rows_[vgetq_lane_u32(indices_low, 0)]; + dst[1] = src_rows_[vgetq_lane_u32(indices_low, 1)]; + dst[2] = src_rows_[vgetq_lane_u32(indices_low, 2)]; + dst[3] = src_rows_[vgetq_lane_u32(indices_low, 3)]; + + dst[4] = src_rows_[vgetq_lane_u32(indices_high, 0)]; + dst[5] = src_rows_[vgetq_lane_u32(indices_high, 1)]; + dst[6] = src_rows_[vgetq_lane_u32(indices_high, 2)]; + dst[7] = src_rows_[vgetq_lane_u32(indices_high, 3)]; + } + void process_row(size_t width, Columns mapxy, Columns dst) { auto vector_path = [&](size_t step) { @@ -36,18 +52,12 @@ class RemapS16 { // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) uint32x4_t indices_low = vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), - vget_low_u16(v_src_element_stride)); - // Copy pixels from source - dst[0] = src_rows_[vgetq_lane_u32(indices_low, 0)]; - dst[1] = src_rows_[vgetq_lane_u32(indices_low, 1)]; - dst[2] = src_rows_[vgetq_lane_u32(indices_low, 2)]; - dst[3] = src_rows_[vgetq_lane_u32(indices_low, 3)]; + vget_low_u16(v_src_element_stride_)); uint32x4_t indices_high = - vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride); - dst[4] = src_rows_[vgetq_lane_u32(indices_high, 0)]; - dst[5] = src_rows_[vgetq_lane_u32(indices_high, 1)]; - dst[6] = src_rows_[vgetq_lane_u32(indices_high, 2)]; - dst[7] = src_rows_[vgetq_lane_u32(indices_high, 3)]; + vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); + + transform_pixels(indices_low, indices_high, dst); + mapxy += ptrdiff_t(step); dst += ptrdiff_t(step); }; @@ -63,11 +73,127 @@ class RemapS16 { private: Rows src_rows_; - uint16x8_t v_src_element_stride; + uint16x8_t v_src_element_stride_; int16x8_t v_xmax_; int16x8_t v_ymax_; -}; // end of class RemapS16 +}; // end of class RemapS16Replicate + +template +class RemapS16ConstantBorder { + public: + using SrcVecTraits = neon::VecTraits; + using SrcVecType = typename SrcVecTraits::VectorType; + + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + + RemapS16ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_element_stride_{vdupq_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u16(*border_value)} {} + + void transform_pixels(uint32x4_t indices_low, uint32x4_t indices_high, + uint16x8_t in_range, Columns dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + auto vector_path = [&](size_t step) { + MapVector2Type xy = vld2q_s16(&mapxy[0]); + + uint16x8_t x = vreinterpretq_u16_s16(xy.val[0]); + uint16x8_t y = vreinterpretq_u16_s16(xy.val[1]); + + // Find whether coordinates are within the image dimensions. + // Negative coordinates are interpreted as large values due to the + // s16->u16 reinterpretation. + uint16x8_t in_range = + vandq_u16(vcltq_u16(x, v_width_), vcltq_u16(y, v_height_)); + + // Zero out-of-range coordinates. + x = vandq_u16(in_range, x); + y = vandq_u16(in_range, y); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + uint32x4_t indices_low = + vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), + vget_low_u16(v_src_element_stride_)); + uint32x4_t indices_high = + vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); + + transform_pixels(indices_low, indices_high, in_range, dst); + + mapxy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x8_t v_src_element_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint16x8_t v_border_; +}; // end of class RemapS16ConstantBorder + +template <> +void RemapS16ConstantBorder::transform_pixels(uint32x4_t indices_low, + uint32x4_t indices_high, + uint16x8_t in_range, + Columns dst) { + uint8x8_t pixels = { + src_rows_[vgetq_lane_u32(indices_low, 0)], + src_rows_[vgetq_lane_u32(indices_low, 1)], + src_rows_[vgetq_lane_u32(indices_low, 2)], + src_rows_[vgetq_lane_u32(indices_low, 3)], + src_rows_[vgetq_lane_u32(indices_high, 0)], + src_rows_[vgetq_lane_u32(indices_high, 1)], + src_rows_[vgetq_lane_u32(indices_high, 2)], + src_rows_[vgetq_lane_u32(indices_high, 3)], + }; + // Select between source pixels and border colour + uint8x8_t pixels_or_border = + vbsl_u8(vmovn_u16(in_range), pixels, vmovn_u16(v_border_)); + + vst1_u8(&dst[0], pixels_or_border); +} +template <> +void RemapS16ConstantBorder::transform_pixels(uint32x4_t indices_low, + uint32x4_t indices_high, + uint16x8_t in_range, + Columns dst) { + uint16x8_t pixels = { + src_rows_[vgetq_lane_u32(indices_low, 0)], + src_rows_[vgetq_lane_u32(indices_low, 1)], + src_rows_[vgetq_lane_u32(indices_low, 2)], + src_rows_[vgetq_lane_u32(indices_low, 3)], + src_rows_[vgetq_lane_u32(indices_high, 0)], + src_rows_[vgetq_lane_u32(indices_high, 1)], + src_rows_[vgetq_lane_u32(indices_high, 2)], + src_rows_[vgetq_lane_u32(indices_high, 3)], + }; + + // Select between source pixels and border colour + uint16x8_t pixels_or_border = vbslq_u16(in_range, pixels, v_border_); + + vst1q_u16(&dst[0], pixels_or_border); +} + +// Most of the complexity comes from parameter checking. +// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, @@ -80,6 +206,9 @@ kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, border_type, channels)) { @@ -89,17 +218,25 @@ kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, Rows src_rows{src, src_stride, channels}; Rows mapxy_rows{mapxy, mapxy_stride, 2}; Rows dst_rows{dst, dst_stride, channels}; - RemapS16 operation{src_rows, src_width, src_height}; Rectangle rect{dst_width, dst_height}; - zip_rows(operation, rect, mapxy_rows, dst_rows); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + RemapS16ConstantBorder operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + RemapS16Replicate operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } return KLEIDICV_OK; } +// NOLINTEND(readability-function-cognitive-complexity) template -class RemapS16Point5; +class RemapS16Point5Replicate; template <> -class RemapS16Point5 { +class RemapS16Point5Replicate { public: using ScalarType = uint8_t; using MapVecTraits = neon::VecTraits; @@ -108,8 +245,8 @@ class RemapS16Point5 { using FracVecTraits = neon::VecTraits; using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5(Rows src_rows, size_t src_width, - size_t src_height) + RemapS16Point5Replicate(Rows src_rows, size_t src_width, + size_t src_height) : src_rows_{src_rows}, v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, @@ -218,8 +355,152 @@ class RemapS16Point5 { uint16x4_t v_src_stride_; int16x8_t v_xmax_; int16x8_t v_ymax_; -}; // end of class RemapS16Point5 +}; // end of class RemapS16Point5Replicate + +template +class RemapS16Point5ConstantBorder; + +template <> +class RemapS16Point5ConstantBorder { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + RemapS16Point5ConstantBorder(Rows src_rows, + size_t src_width, size_t src_height, + const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_stride_{vdupq_n_u16(static_cast(src_rows_.stride()))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdup_n_u8(*border_value)} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + uint8x8_t frac_max = vdup_n_u8(REMAP16POINT5_FRAC_MAX); + uint8x8_t frac_mask = vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1); + uint8x8_t xfrac = vand_u8(vmovn_u16(frac), frac_mask); + uint8x8_t yfrac = + vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); + uint8x8_t nxfrac = vsub_u8(frac_max, xfrac); + uint8x8_t nyfrac = vsub_u8(frac_max, yfrac); + + uint16x8_t one = vdupq_n_u16(1); + uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); + uint16x8_t y0 = vreinterpretq_u16_s16(xy.val[1]); + uint16x8_t x1 = vaddq_u16(x0, one); + uint16x8_t y1 = vaddq_u16(y0, one); + + uint8x8_t v00 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0); + uint8x8_t v01 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1); + uint8x8_t v10 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0); + uint8x8_t v11 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); + + uint8x8_t result = interpolate(v00, v01, v10, v11, xfrac, vmovl_u8(yfrac), + nxfrac, vmovl_u8(nyfrac)); + + vst1_u8(&dst[0], result); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + uint8x8_t load_pixels_or_constant_border(Rows &src_rows_, + uint16x8_t v_src_element_stride_, + uint16x8_t v_width_, + uint16x8_t v_height_, + uint8x8_t v_border_, uint16x8_t x, + uint16x8_t y) { + // Find whether coordinates are within the image dimensions. + // Negative coordinates are interpreted as large values due to the s16->u16 + // reinterpretation. + uint16x8_t in_range = + vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), + vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); + + // Zero out-of-range coordinates. + x = vandq_u16(in_range, x); + y = vandq_u16(in_range, y); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + uint32x4_t indices_low = + vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), + vget_low_u16(v_src_element_stride_)); + uint32x4_t indices_high = + vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); + + // Read pixels from source + uint8x8_t pixels = { + src_rows_[vgetq_lane_u32(indices_low, 0)], + src_rows_[vgetq_lane_u32(indices_low, 1)], + src_rows_[vgetq_lane_u32(indices_low, 2)], + src_rows_[vgetq_lane_u32(indices_low, 3)], + src_rows_[vgetq_lane_u32(indices_high, 0)], + src_rows_[vgetq_lane_u32(indices_high, 1)], + src_rows_[vgetq_lane_u32(indices_high, 2)], + src_rows_[vgetq_lane_u32(indices_high, 3)], + }; + // Select between source pixels and border colour + return vbsl_u8(vmovn_u16(in_range), pixels, v_border_); + } + + uint8x8_t interpolate(uint8x8_t v00, uint8x8_t v01, uint8x8_t v10, + uint8x8_t v11, uint8x8_t xfrac, uint16x8_t yfrac, + uint8x8_t nxfrac, uint16x8_t nyfrac) { + auto interpolate_horizontal = [&](uint8x8_t left, uint8x8_t right) { + return vmlal_u8(vmull_u8(nxfrac, left), xfrac, right); + }; + + // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint16x4_t a, uint16x4_t b, uint16x4_t frac, + uint16x4_t nfrac) { + uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, a, nfrac), b, frac); + return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint16x8_t line0 = interpolate_horizontal(v00, v10); + uint16x8_t line1 = interpolate_horizontal(v01, v11); + + uint16x4_t lo = + interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), + vget_low_u16(yfrac), vget_low_u16(nyfrac)); + uint16x4_t hi = + interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), + vget_high_u16(yfrac), vget_high_u16(nyfrac)); + + // Discard upper 8 bits of each element and combine low and high parts into + // a single register. + return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); + } + + Rows src_rows_; + uint16x8_t v_src_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint8x8_t v_border_; +}; // end of class RemapS16Point5ConstantBorder + +// Most of the complexity comes from parameter checking. +// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_s16point5( const T *src, size_t src_stride, size_t src_width, size_t src_height, @@ -234,6 +515,9 @@ kleidicv_error_t remap_s16point5( CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } if (!remap_s16point5_is_implemented(src_stride, src_width, src_height, dst_width, border_type, channels)) { @@ -244,11 +528,19 @@ kleidicv_error_t remap_s16point5( Rows mapxy_rows{mapxy, mapxy_stride, 2}; Rows mapfrac_rows{mapfrac, mapfrac_stride, 1}; Rows dst_rows{dst, dst_stride, channels}; - RemapS16Point5 operation{src_rows, src_width, src_height}; Rectangle rect{dst_width, dst_height}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + RemapS16Point5Replicate operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } return KLEIDICV_OK; } +// NOLINTEND(readability-function-cognitive-complexity) #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16( \ diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index 00fd7de13..b6311ac7f 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -20,27 +21,27 @@ namespace KLEIDICV_TARGET_NAMESPACE { template -class RemapS16 { +class RemapS16Replicate { public: using MapVecTraits = VecTraits; using MapVectorType = typename MapVecTraits::VectorType; using MapVector2Type = typename MapVecTraits::Vector2Type; - RemapS16(Rows src_rows, size_t src_width, size_t src_height, - svuint16_t& v_src_stride, MapVectorType& v_x_max, - MapVectorType& v_y_max) + RemapS16Replicate(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_element_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) : src_rows_{src_rows}, - v_src_element_stride{v_src_stride}, + v_src_element_stride_{v_src_element_stride}, v_xmax_{v_x_max}, v_ymax_{v_y_max} { - v_src_element_stride = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); v_xmax_ = svdup_s16(static_cast(src_width - 1)); v_ymax_ = svdup_s16(static_cast(src_height - 1)); } - void transform_pixels(svuint32_t offsets_b, svbool_t pg_b, + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, svbool_t pg_t, - Columns dst, svbool_t pg_dst); + Columns dst); void process_row(size_t width, Columns mapxy, Columns dst) { @@ -54,13 +55,8 @@ class RemapS16 { svuint16_t y = svreinterpret_u16_s16( svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_))); // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) - offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride); - offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride); - // Account for the size of the source type when calculating offset - if constexpr (std::is_same::value) { - offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); - offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); - } + offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); }; svbool_t pg_all16 = MapVecTraits::svptrue(); @@ -70,7 +66,7 @@ class RemapS16 { load_offsets(pg); svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); - transform_pixels(offsets_b, pg_b, offsets_t, pg_t, dst, pg); + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst); mapxy += step; dst += step; }; @@ -78,7 +74,7 @@ class RemapS16 { // NOTE: gather load is not available in streaming mode auto gather_load_full_vector_path = [&](ptrdiff_t step) { load_offsets(pg_all16); - transform_pixels(offsets_b, pg_all32, offsets_t, pg_all32, dst, pg_all16); + transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst); mapxy += step; dst += step; }; @@ -95,16 +91,15 @@ class RemapS16 { private: Rows src_rows_; - svuint16_t& v_src_element_stride; + svuint16_t& v_src_element_stride_; MapVectorType& v_xmax_; MapVectorType& v_ymax_; -}; // end of class RemapS16 +}; // end of class RemapS16Replicate template <> -void RemapS16::transform_pixels(svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, - Columns dst, - svbool_t pg_dst) { +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { // Copy pixels from source svuint32_t result_b = svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); @@ -112,14 +107,18 @@ void RemapS16::transform_pixels(svuint32_t offsets_b, svbool_t pg_b, svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), svreinterpret_u16_u32(result_t)); - svst1b_u16(pg_dst, &dst[0], result); + + svst1b_u16(pg, &dst[0], result); } template <> -void RemapS16::transform_pixels(svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, - Columns dst, - svbool_t pg_dst) { +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + // Copy pixels from source svuint32_t result_b = svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); @@ -127,21 +126,121 @@ void RemapS16::transform_pixels(svuint32_t offsets_b, svbool_t pg_b, svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), svreinterpret_u16_u32(result_t)); - svst1_u16(pg_dst, &dst[0], result); + + svst1_u16(pg, &dst[0], result); +} + +template +class RemapS16ConstantBorder { + public: + RemapS16ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + svuint16_t& v_src_element_stride, svuint16_t& v_width, + svuint16_t& v_height, svuint16_t& v_border) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_element_stride}, + v_width_{v_width}, + v_height_{v_height}, + v_border_{v_border} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_width_ = svdup_u16(static_cast(src_width)); + v_height_ = svdup_u16(static_cast(src_height)); + v_border_ = svdup_u16(*border_value); + } + + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, + svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + for (size_t i = 0; i < width; i += svcnth()) { + svbool_t pg = svwhilelt_b16(i, width); + + svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast(i * 2)]); + svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0)); + svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1)); + + // Find whether coordinates are within the image dimensions. + svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), + svcmplt_u16(pg, y, v_height_)); + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + svuint32_t offsets_b = + svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + svuint32_t offsets_t = + svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, + &dst[static_cast(i)]); + } + } + + private: + Rows src_rows_; + svuint16_t& v_src_element_stride_; + svuint16_t& v_width_; + svuint16_t& v_height_; + svuint16_t& v_border_; +}; // end of class RemapS16ConstantBorder + +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint8_t* dst) { + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1b_u16(pg, dst, result_selected); } +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint16_t* dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1_u16(pg, dst, result_selected); +} + +// Most of the complexity comes from parameter checking. +// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_s16_sc(const T* src, size_t src_stride, size_t src_width, size_t src_height, T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, const int16_t* mapxy, size_t mapxy_stride, - kleidicv_border_type_t border_type, const T*) { + kleidicv_border_type_t border_type, + const T* border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, border_type, channels)) { @@ -151,20 +250,61 @@ kleidicv_error_t remap_s16_sc(const T* src, size_t src_stride, size_t src_width, Rows src_rows{src, src_stride, channels}; Rows mapxy_rows{mapxy, mapxy_stride, 2}; Rows dst_rows{dst, dst_stride, channels}; - svuint16_t sv_src_stride; - svint16_t sv_xmax, sv_ymax; - RemapS16 operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; + svuint16_t sv_src_element_stride; Rectangle rect{dst_width, dst_height}; - zip_rows(operation, rect, mapxy_rows, dst_rows); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + svuint16_t sv_width, sv_height, sv_border; + RemapS16ConstantBorder operation{ + src_rows, src_width, src_height, border_value, sv_src_element_stride, + sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + svint16_t sv_xmax, sv_ymax; + RemapS16Replicate operation{src_rows, src_width, + src_height, sv_src_element_stride, + sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } return KLEIDICV_OK; } +// NOLINTEND(readability-function-cognitive-complexity) template -class RemapS16Point5; +inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac, + svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, + svuint32_t bias); template <> -class RemapS16Point5 { +inline svuint16_t interpolate_16point5( + svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t nxfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + svuint16_t nyfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + svuint16_t line0 = svmla_x(pg, svmul_x(pg, xfrac, src_b), nxfrac, src_a); + svuint16_t line1 = svmla_x(pg, svmul_x(pg, xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); + svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); + acc_b = svmlalb_u32(acc_b, line1, yfrac); + acc_t = svmlalt_u32(acc_t, line1, yfrac); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); +} + +template +class RemapS16Point5Replicate; + +template <> +class RemapS16Point5Replicate { public: using ScalarType = uint8_t; using MapVecTraits = VecTraits; @@ -173,18 +313,16 @@ class RemapS16Point5 { using FracVecTraits = VecTraits; using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5(Rows src_rows, size_t src_width, - size_t src_height, svuint16_t& v_src_stride, - MapVectorType& v_x_max, MapVectorType& v_y_max) + RemapS16Point5Replicate(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) : src_rows_{src_rows}, v_src_stride_{v_src_stride}, v_xmax_{v_x_max}, v_ymax_{v_y_max} { v_src_stride_ = svdup_u16(src_rows.stride()); - v_xmax_ = svdup_s16(static_cast( - std::min(std::numeric_limits::max(), src_width - 1))); - v_ymax_ = svdup_s16(static_cast( - std::min(std::numeric_limits::max(), src_height - 1))); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); } void process_row(size_t width, Columns mapxy, @@ -264,28 +402,10 @@ class RemapS16Point5 { Columns& mapfrac, Columns& dst, svuint16_t src_a, svuint16_t src_b, svuint16_t src_c, - svuint16_t src_d, - svuint32_t bias) KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t src_d, svuint32_t bias) { FracVectorType frac = svld1_u16(pg, &mapfrac[0]); - svuint16_t xfrac = - svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); - svuint16_t yfrac = - svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), - svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); - svuint16_t nxfrac = - svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); - svuint16_t nyfrac = - svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); - svuint16_t line0 = svmla_x(pg, svmul_x(pg, xfrac, src_b), nxfrac, src_a); - svuint16_t line1 = svmla_x(pg, svmul_x(pg, xfrac, src_d), nxfrac, src_c); - - svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); - svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); - acc_b = svmlalb_u32(acc_b, line1, yfrac); - acc_t = svmlalt_u32(acc_t, line1, yfrac); - - svuint16_t result = svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), - acc_t, 2ULL * REMAP16POINT5_FRAC_BITS); + svuint16_t result = interpolate_16point5(pg, frac, src_a, src_b, + src_c, src_d, bias); svst1b_u16(pg, &dst[0], result); mapfrac += step; dst += step; @@ -297,21 +417,120 @@ class RemapS16Point5 { svuint16_t& v_src_stride_; MapVectorType& v_xmax_; MapVectorType& v_ymax_; -}; // end of class RemapS16Point5 +}; // end of class RemapS16Point5Replicate + +template +class RemapS16Point5ConstantBorder; + +template <> +class RemapS16Point5ConstantBorder { + public: + using ScalarType = uint8_t; + + RemapS16Point5ConstantBorder(Rows src_rows, + size_t src_width, size_t src_height, + const ScalarType* border_value, + svuint16_t& v_src_stride, svuint16_t& v_width, + svuint16_t& v_height, svuint16_t& v_border) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_width_{v_width}, + v_height_{v_height}, + v_border_{v_border} { + v_src_stride_ = svdup_u16(src_rows.stride()); + v_width_ = svdup_u16(static_cast(src_width)); + v_height_ = svdup_u16(static_cast(src_height)); + v_border_ = svdup_u16(*border_value); + } + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + svuint16_t one = svdup_n_u16(1); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + for (size_t i = 0; i < width; i += svcnth()) { + svbool_t pg = svwhilelt_b16(i, width); + + svuint16x2_t xy = + svld2_u16(pg, reinterpret_cast( + &mapxy[static_cast(i * 2)])); + + svuint16_t x0 = svget2(xy, 0); + svuint16_t y0 = svget2(xy, 1); + svuint16_t x1 = svadd_x(pg, x0, one); + svuint16_t y1 = svadd_x(pg, y0, one); + + svuint16_t v00 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y0); + svuint16_t v01 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y1); + svuint16_t v10 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y0); + svuint16_t v11 = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y1); + + svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast(i)]); + svuint16_t result = + interpolate_16point5(pg, frac, v00, v10, v01, v11, bias); + + svst1b_u16(pg, &dst[static_cast(i)], result); + } + } + + private: + svuint16_t load_pixels_or_constant_border(Rows src_rows_, + svuint16_t& v_src_stride_, + svuint16_t& v_width_, + svuint16_t& v_height_, + svuint16_t& v_border_, svbool_t pg, + svuint16_t x, svuint16_t y) { + // Find whether coordinates are within the image dimensions. + svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), + svcmplt_u16(pg, y, v_height_)); + + // Calculate offsets from coordinates (y * stride + x) + svuint32_t offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_stride_); + svuint32_t offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_stride_); + + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); + + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + return svsel(in_range, result, v_border_); + } + + Rows src_rows_; + svuint16_t& v_src_stride_; + svuint16_t& v_width_; + svuint16_t& v_height_; + svuint16_t& v_border_; +}; // end of class RemapS16Point5ConstantBorder + +// Most of the complexity comes from parameter checking. +// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_s16point5_sc( const T* src, size_t src_stride, size_t src_width, size_t src_height, T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, const int16_t* mapxy, size_t mapxy_stride, const uint16_t* mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, const T*) { + kleidicv_border_type_t border_type, const T* border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } if (!remap_s16point5_is_implemented(src_stride, src_width, src_height, dst_width, border_type, channels)) { @@ -323,14 +542,24 @@ kleidicv_error_t remap_s16point5_sc( Rows mapfrac_rows{mapfrac, mapfrac_stride, 1}; Rows dst_rows{dst, dst_stride, channels}; svuint16_t sv_src_stride; - svint16_t sv_xmax, sv_ymax; - - RemapS16Point5 operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; Rectangle rect{dst_width, dst_height}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + svuint16_t sv_width, sv_height, sv_border; + RemapS16Point5ConstantBorder operation{ + src_rows, src_width, src_height, border_value, + sv_src_stride, sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + svint16_t sv_xmax, sv_ymax; + RemapS16Point5Replicate operation{src_rows, src_width, src_height, + sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } return KLEIDICV_OK; } +// NOLINTEND(readability-function-cognitive-complexity) } // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 4eb782d1c..1c5b5376b 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -78,9 +78,12 @@ CompareGt: opencv_perf_core '*compare/*' '($PIXEL_FORMAT, 8UC1, CMP_GT)' InRange_U8: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 8UC1, 1, 2)' InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)' -Remap_S16_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' -Remap_S16_U16: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' -Remap_S16Point5_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' +Remap_S16_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' +Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' +Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' +Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' +Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' +Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' WarpPerspectiveNear_Nearest: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index c6ffdbf33..838f8a2c3 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -16,29 +16,53 @@ KLEIDICV_REMAP_S16(uint8_t, u8); KLEIDICV_REMAP_S16(uint16_t, u16); +template +static const ScalarType *get_array2d_element_or_border( + const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, + kleidicv_border_type_t border_type, const ScalarType *border_value) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + x = std::clamp(x, 0, static_cast(src.width()) - 1); + y = std::clamp(y, 0, static_cast(src.height()) - 1); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); + if (x >= static_cast(src.width()) || + y >= static_cast(src.height()) || x < 0 || y < 0) { + return border_value; + } + } + return src.at(y, x * src.channels()); +} + template class RemapS16 : public testing::Test { public: static void test_random(size_t src_w, size_t src_h, size_t dst_w, - size_t dst_h, size_t channels, size_t padding) { + size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; test::PseudoRandomNumberGenerator coord_generator; mapxy.fill(coord_generator); - execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); } static void test_outside_random(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; test::PseudoRandomNumberGeneratorIntRange coord_generator{ static_cast(-src_w), static_cast(2 * src_w)}; mapxy.fill(coord_generator); - execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); } static void test_blend(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - size_t channels, size_t padding) { + size_t channels, kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; for (size_t row = 0; row < dst_h; ++row) { for (size_t column = 0; column < dst_w; ++column) { @@ -53,12 +77,16 @@ class RemapS16 : public testing::Test { 4 * row / dst_h))); } } - execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); } // Test coordinates with edge values that may easily overflow static void test_corner_cases(size_t src_w, size_t src_h, size_t dst_w, - size_t dst_h, size_t channels, size_t padding) { + size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, + size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; // One more y than x so we'll see many combinations const int16_t corner_x_values[] = {-32768, @@ -96,17 +124,16 @@ class RemapS16 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - test::PseudoRandomNumberGenerator generator; actual.fill(42); - calculate_expected(source, mapxy, expected); + calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, mapxy.data(), - mapxy.stride(), KLEIDICV_BORDER_TYPE_REPLICATE, {})); + ASSERT_EQ( + KLEIDICV_OK, + remap_s16()( + source.data(), source.stride(), source.width(), source.height(), + actual.data(), actual.stride(), actual.width(), actual.height(), + channels, mapxy.data(), mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -114,7 +141,8 @@ class RemapS16 : public testing::Test { private: static void execute_test(test::Array2D &mapxy, size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - size_t channels, size_t padding) { + size_t channels, kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -127,32 +155,34 @@ class RemapS16 : public testing::Test { source.fill(generator); actual.fill(42); - calculate_expected(source, mapxy, expected); + calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, mapxy.data(), - mapxy.stride(), KLEIDICV_BORDER_TYPE_REPLICATE, {})); + ASSERT_EQ( + KLEIDICV_OK, + remap_s16()( + source.data(), source.stride(), source.width(), source.height(), + actual.data(), actual.stride(), actual.width(), actual.height(), + channels, mapxy.data(), mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } static void calculate_expected(test::Array2D &src, test::Array2D &mapxy, + kleidicv_border_type_t border_type, + const ScalarType *border_value, test::Array2D &expected) { + auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { + return get_array2d_element_or_border(src, x, y, border_type, + border_value); + }; + for (size_t row = 0; row < expected.height(); row++) { for (size_t column = 0; column < expected.width() / src.channels(); ++column) { for (size_t ch = 0; ch < src.channels(); ++ch) { - int16_t y = std::max( - 0, std::min(src.height() - 1, - *mapxy.at(row, column * 2 + 1))); - int16_t x = std::max( - 0, - std::min(src.width() - 1, *mapxy.at(row, column * 2))); - *expected.at(row, column * src.channels() + ch) = - *src.at(y, x * src.channels() + ch); + const int16_t *coords = mapxy.at(row, column * 2); + int16_t x = coords[0], y = coords[1]; + *expected.at(row, column * src.channels() + ch) = get_src(x, y)[ch]; } } } @@ -162,6 +192,18 @@ class RemapS16 : public testing::Test { using RemapElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapS16, RemapElementTypes); +template +static const auto &get_borders() { + using P = std::pair; + static const T border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {4, 5, 6, 7}; + static const std::array borders{ + P{KLEIDICV_BORDER_TYPE_REPLICATE, nullptr}, + P{KLEIDICV_BORDER_TYPE_REPLICATE, border_value}, + P{KLEIDICV_BORDER_TYPE_CONSTANT, border_value}, + }; + return borders; +} + TYPED_TEST(RemapS16, RandomNoPadding) { size_t src_w = 3 * test::Options::vector_lanes() - 1; size_t src_h = 4; @@ -169,7 +211,10 @@ TYPED_TEST(RemapS16, RandomNoPadding) { size_t dst_h = src_h; size_t channels = 1; size_t padding = 0; - TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16, OutsideRandomPadding) { @@ -179,8 +224,10 @@ TYPED_TEST(RemapS16, OutsideRandomPadding) { size_t dst_h = src_h; size_t channels = 1; size_t padding = 13; - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, - padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); + } } TYPED_TEST(RemapS16, BlendPadding) { @@ -190,7 +237,10 @@ TYPED_TEST(RemapS16, BlendPadding) { size_t dst_h = src_h; size_t channels = 1; size_t padding = 13; - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16, BlendBigStride) { @@ -200,17 +250,23 @@ TYPED_TEST(RemapS16, BlendBigStride) { size_t dst_h = src_h; size_t channels = 1; size_t padding = std::numeric_limits::max() - src_w; - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16, CornerCases) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; + size_t src_w = std::numeric_limits::max() + 1; + size_t src_h = std::numeric_limits::max() + 1; + size_t dst_w = 3 * test::Options::vector_lanes() - 1; + size_t dst_h = 4; size_t channels = 1; size_t padding = 17; - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); + } } TYPED_TEST(RemapS16, NullPointer) { @@ -219,7 +275,7 @@ TYPED_TEST(RemapS16, NullPointer) { int16_t mapxy[2] = {}; test::test_null_args(remap_s16(), src, 2 * sizeof(TypeParam), 2, 2, dst, 1 * sizeof(TypeParam), 1, 1, 1, mapxy, 4, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + KLEIDICV_BORDER_TYPE_CONSTANT, src); } TYPED_TEST(RemapS16, ZeroImageSize) { @@ -311,7 +367,7 @@ TYPED_TEST(RemapS16, UnsupportedTwoChannels) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapS16, UnsupportedBorderTypeConst) { +TYPED_TEST(RemapS16, UnsupportedBorderType) { const TypeParam src[1] = {}; TypeParam dst[8]; int16_t mapxy[16] = {}; @@ -319,7 +375,7 @@ TYPED_TEST(RemapS16, UnsupportedBorderTypeConst) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_s16()(src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, - KLEIDICV_BORDER_TYPE_CONSTANT, nullptr)); + KLEIDICV_BORDER_TYPE_REFLECT, src)); } TYPED_TEST(RemapS16, UnsupportedTooSmallImage) { @@ -341,18 +397,23 @@ class RemapS16Point5 : public testing::Test { static const uint16_t FRAC_MAX_SQUARE = FRAC_MAX * FRAC_MAX; static void test_random(size_t src_w, size_t src_h, size_t dst_w, - size_t dst_h, size_t channels, size_t padding) { + size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy(2 * dst_w, dst_h, padding, 2); test::PseudoRandomNumberGenerator coord_generator; mapxy.fill(coord_generator); test::Array2D mapfrac(dst_w, dst_h, padding); test::PseudoRandomNumberGenerator frac_generator; mapfrac.fill(frac_generator); - execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); } static void test_outside_random(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy(2 * dst_w, dst_h, padding, 2); test::PseudoRandomNumberGeneratorIntRange coord_generator{ @@ -362,11 +423,13 @@ class RemapS16Point5 : public testing::Test { test::PseudoRandomNumberGeneratorIntRange frac_generator( 0, static_cast(FRAC_MAX_SQUARE * 3 / 2)); mapfrac.fill(frac_generator); - execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); } static void test_blend(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - size_t channels, size_t padding) { + size_t channels, kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; test::Array2D mapfrac(dst_w, dst_h, padding); for (size_t row = 0; row < dst_h; ++row) { @@ -388,12 +451,16 @@ class RemapS16Point5 : public testing::Test { << FRAC_BITS); } } - execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, padding); + execute_test(mapxy, mapfrac, src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); } // Test coordinates with edge values that may easily overflow static void test_corner_cases(size_t src_w, size_t src_h, size_t dst_w, - size_t dst_h, size_t channels, size_t padding) { + size_t dst_h, size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, + size_t padding) { test::Array2D mapxy{2 * dst_w, dst_h, padding, 2}; test::Array2D mapfrac(dst_w, dst_h, padding); // One more y than x so we'll see many combinations @@ -436,18 +503,17 @@ class RemapS16Point5 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - test::PseudoRandomNumberGenerator generator; actual.fill(42); - calculate_expected(source, mapxy, mapfrac, expected); + calculate_expected(source, mapxy, mapfrac, border_type, border_value, + expected); - ASSERT_EQ( - KLEIDICV_OK, - kleidicv_remap_s16point5_u8( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), KLEIDICV_BORDER_TYPE_REPLICATE, {})); + ASSERT_EQ(KLEIDICV_OK, kleidicv_remap_s16point5_u8( + source.data(), source.stride(), source.width(), + source.height(), actual.data(), actual.stride(), + actual.width(), actual.height(), channels, + mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -456,7 +522,8 @@ class RemapS16Point5 : public testing::Test { static void execute_test(test::Array2D &mapxy, test::Array2D &mapfrac, size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - size_t channels, size_t padding) { + size_t channels, kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding) { size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -468,15 +535,15 @@ class RemapS16Point5 : public testing::Test { source.fill(generator); actual.fill(42); - calculate_expected(source, mapxy, mapfrac, expected); + calculate_expected(source, mapxy, mapfrac, border_type, border_value, + expected); - ASSERT_EQ( - KLEIDICV_OK, - kleidicv_remap_s16point5_u8( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), KLEIDICV_BORDER_TYPE_REPLICATE, {})); + ASSERT_EQ(KLEIDICV_OK, kleidicv_remap_s16point5_u8( + source.data(), source.stride(), source.width(), + source.height(), actual.data(), actual.stride(), + actual.width(), actual.height(), channels, + mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -494,7 +561,14 @@ class RemapS16Point5 : public testing::Test { static void calculate_expected(test::Array2D &src, test::Array2D &mapxy, test::Array2D &mapfrac, + kleidicv_border_type_t border_type, + const ScalarType *border_value, test::Array2D &expected) { + auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { + return get_array2d_element_or_border(src, x, y, border_type, + border_value); + }; + for (size_t row = 0; row < expected.height(); row++) { for (size_t column = 0; column < expected.width() / src.channels(); ++column) { @@ -507,24 +581,11 @@ class RemapS16Point5 : public testing::Test { uint8_t y_frac = (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) - int16_t x0 = - std::min(src.width() - 1, *mapxy.at(row, column * 2)); - int16_t y0 = std::min(src.height() - 1, - *mapxy.at(row, column * 2 + 1)); - if (x0 < 0) { - x0 = 0, x_frac = 0; - } - if (y0 < 0) { - y0 = 0, y_frac = 0; - } - int16_t x1 = static_cast(x0) >= src.width() - 1 ? x0 : x0 + 1; - int16_t y1 = - static_cast(y0) >= src.height() - 1 ? y0 : y0 + 1; + const int16_t *coords = mapxy.at(row, column * 2); + int16_t x = coords[0], y = coords[1]; *expected.at(row, column * src.channels() + ch) = - lerp2d(x_frac, y_frac, *src.at(y0, x0 * src.channels() + ch), - *src.at(y0, x1 * src.channels() + ch), - *src.at(y1, x0 * src.channels() + ch), - *src.at(y1, x1 * src.channels() + ch)); + lerp2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], + get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); } } } @@ -539,7 +600,10 @@ TYPED_TEST(RemapS16Point5, RandomNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test_random(src_w, src_h, dst_w, dst_h, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_random(src_w, src_h, dst_w, dst_h, 1, border_type, + border_value, 0); + } } TYPED_TEST(RemapS16Point5, BlendPadding) { @@ -547,7 +611,10 @@ TYPED_TEST(RemapS16Point5, BlendPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, 1, 13); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, 1, border_type, + border_value, 13); + } } TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { @@ -555,7 +622,10 @@ TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, 1, 13); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, 1, border_type, + border_value, 13); + } } TYPED_TEST(RemapS16Point5, BlendBigStride) { @@ -565,7 +635,10 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) { size_t dst_h = src_h; size_t channels = 1; size_t padding = std::numeric_limits::max() - src_w; - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16Point5, CornerCases) { @@ -573,7 +646,10 @@ TYPED_TEST(RemapS16Point5, CornerCases) { size_t src_h = std::numeric_limits::max() + 1; size_t dst_w = 3 * test::Options::vector_lanes() - 1; size_t dst_h = 4; - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, 1, 17); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, 1, border_type, + border_value, 17); + } } TYPED_TEST(RemapS16Point5, NullPointer) { @@ -582,8 +658,8 @@ TYPED_TEST(RemapS16Point5, NullPointer) { int16_t mapxy[2] = {}; uint16_t mapfrac[1] = {}; test::test_null_args(kleidicv_remap_s16point5_u8, src, 2, 2, 2, dst, 1, 1, 1, - 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr); + 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_CONSTANT, + src); } TYPED_TEST(RemapS16Point5, ZeroImageSize) { @@ -655,7 +731,7 @@ TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapS16Point5, UnsupportedBorderTypeConst) { +TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { const TypeParam src[1] = {}; TypeParam dst[8]; int16_t mapxy[16] = {}; @@ -664,7 +740,7 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderTypeConst) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, kleidicv_remap_s16point5_u8(src, 1, 1, 1, dst, 8, 8, 1, 1, mapxy, 4, mapfrac, 2, - KLEIDICV_BORDER_TYPE_CONSTANT, src)); + KLEIDICV_BORDER_TYPE_REFLECT, src)); } TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) { diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 27f6a6ba4..6556f78ad 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -615,16 +615,18 @@ TEST_P(Thread, remap_s16_u8_not_implemented) { KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u8, 1, - KLEIDICV_BORDER_TYPE_CONSTANT, + KLEIDICV_BORDER_TYPE_REFLECT, border_value); } TEST_P(Thread, remap_s16_u16_not_implemented) { + const uint16_t border_value[4] = {}; check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u16, 2, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr); - check_remap_s16_not_implemented( - kleidicv_thread_remap_s16_u16, 1, KLEIDICV_BORDER_TYPE_CONSTANT, nullptr); + border_value); + check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u16, 1, + KLEIDICV_BORDER_TYPE_REFLECT, + border_value); } TEST_P(Thread, remap_s16point5_u8_border_replicate) { @@ -639,7 +641,7 @@ TEST_P(Thread, remap_s16point5_u8_not_implemented) { kleidicv_thread_remap_s16point5_u8, 2, KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_remap_s16point5_not_implemented( - kleidicv_thread_remap_s16point5_u8, 1, KLEIDICV_BORDER_TYPE_CONSTANT, + kleidicv_thread_remap_s16point5_u8, 1, KLEIDICV_BORDER_TYPE_REFLECT, border_value); } -- GitLab