From cc99871a56c0dbd00aacd4ab00e25e9cbd301bc4 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 11 Feb 2025 10:27:03 +0000 Subject: [PATCH 1/2] Implement RemapF32 for u16 data, for Neon --- adapters/opencv/kleidicv_hal.cpp | 18 +- adapters/opencv/kleidicv_hal.h | 13 +- adapters/opencv/opencv-4.11.patch | 19 +- conformity/opencv/test_remap.cpp | 15 +- kleidicv/include/kleidicv/kleidicv.h | 16 +- kleidicv/include/kleidicv/transform/remap.h | 11 +- kleidicv/src/transform/common_sc.h | 8 +- kleidicv/src/transform/remap_api.cpp | 4 + kleidicv/src/transform/remap_neon.cpp | 310 +++++++++++++++++- kleidicv/src/transform/remap_sc.h | 7 +- kleidicv/src/transform/remap_sve2.cpp | 17 +- .../src/transform/warp_perspective_neon.cpp | 4 +- kleidicv/src/transform/warp_perspective_sc.h | 2 +- .../include/kleidicv_thread/kleidicv_thread.h | 14 +- kleidicv_thread/src/kleidicv_thread.cpp | 27 +- test/api/test_remap.cpp | 245 ++++++++------ test/api/test_thread.cpp | 27 ++ 17 files changed, 596 insertions(+), 161 deletions(-) diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index c5b82b23b..e73c350fa 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1379,8 +1379,9 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, int remap_f32(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, - int dst_width, int dst_height, float *mapx, size_t mapx_step, - float *mapy, size_t mapy_step, int interpolation, int border_type, + int dst_width, int dst_height, const float *mapx, + size_t mapx_step, const float *mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value_f64[4]) { kleidicv_border_type_t kleidicv_border_type; if (from_opencv(border_type, kleidicv_border_type)) { @@ -1392,18 +1393,25 @@ int remap_f32(int src_type, const uchar *src_data, size_t src_step, return CV_HAL_ERROR_NOT_IMPLEMENTED; } - auto border_value = get_border_value(border_value_f64); - auto mt = get_multithreading(); - // Only implement CV_8UC1 so far if (src_type == CV_8UC1) { + auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u8( src_data, src_step, static_cast(src_width), static_cast(src_height), dst_data, dst_step, static_cast(dst_width), static_cast(dst_height), 1, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_16UC1) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_f32_u16( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 1, + mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + kleidicv_border_type, border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index dff501676..dd64629e7 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -160,9 +160,9 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, int remap_f32(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, - int dst_width, int dst_height, float *mapx, size_t mapx_step, - float *mapy, size_t mapy_step, int interpolation, int border_type, - const double border_value[4]); + int dst_width, int dst_height, const float *mapx, + size_t mapx_step, const float *mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); int warp_perspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, @@ -427,13 +427,14 @@ static inline int kleidicv_remap_s16point5_with_fallback( static inline int kleidicv_remap_f32_with_fallback( int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, - int dst_height, float *mapx, size_t mapx_step, float *mapy, + int dst_height, const float *mapx, size_t mapx_step, const float *mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]) { return KLEIDICV_HAL_FALLBACK_FORWARD( remap_f32, cv_hal_remap32f, src_type, src_data, src_step, src_width, - src_height, dst_data, dst_step, dst_width, dst_height, mapx, mapx_step, - mapy, mapy_step, interpolation, border_type, border_value); + src_height, dst_data, dst_step, dst_width, dst_height, + const_cast(mapx), mapx_step, const_cast(mapy), + mapy_step, interpolation, border_type, border_value); } #undef cv_hal_remap32f #define cv_hal_remap32f kleidicv_remap_f32_with_fallback diff --git a/adapters/opencv/opencv-4.11.patch b/adapters/opencv/opencv-4.11.patch index 2029c3026..85f8d786b 100644 --- a/adapters/opencv/opencv-4.11.patch +++ b/adapters/opencv/opencv-4.11.patch @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // SPDX-FileCopyrightText: Copyright (C) 2000-2022, Intel Corporation, all rights reserved. // SPDX-FileCopyrightText: Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // SPDX-FileCopyrightText: Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. @@ -228,20 +228,3 @@ index 673c6f03e6..56d9e0b554 100644 } void CV_ImageWarpBaseTest::validate_results() const -diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp -index e8840d231b..adfaad38b0 100644 ---- a/modules/imgproc/test/test_imgwarp.cpp -+++ b/modules/imgproc/test/test_imgwarp.cpp -@@ -1371,7 +1371,11 @@ TEST_P(Imgproc_RemapRelative, validity) - cv::remap(src, dstRelative, mapRelativeX32F, mapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType); - } - -- EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0); -+ if (interpolation != INTER_LINEAR) { -+ EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0); -+ } else { // TODO: Check whether 4 is acceptable? f32_u8_linear_constant can be 4 -+ EXPECT_LT(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 5); // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test -+ } - }; - - INSTANTIATE_TEST_CASE_P(ImgProc, Imgproc_RemapRelative, testing::Combine( diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index 89d6b0eba..6ad48d32d 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -10,7 +10,7 @@ #include "opencv2/imgproc/hal/interface.h" #include "tests.h" -const int kMaxHeight = 32, kMaxWidth = 32; +const int kMaxHeight = 36, kMaxWidth = 32; template static cv::Mat get_source_mat(int format) { @@ -178,7 +178,7 @@ bool test_remap_f32(int index, RecreatedMessageQueue& request_queue, (CV_MAT_DEPTH(Format) == CV_8U && !are_matrices_different(2, actual_mat, expected_mat)) || (CV_MAT_DEPTH(Format) == CV_16U && - !are_matrices_different(2, actual_mat, expected_mat)); + !are_matrices_different(2048, actual_mat, expected_mat)); if (!success) { fail_print_matrices(w, h, source_mat, actual_mat, expected_mat); std::cout << "=== mapx_mat:" << std::endl; @@ -199,15 +199,18 @@ std::vector& remap_tests_get() { static std::vector tests = { TEST("RemapS16 uint8 Replicate", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Replicate", (test_remap_s16), (exec_remap_s16)), - TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), - TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), - TEST("RemapF32 uint8 Replicate", (test_remap_f32), (exec_remap_f32)), - TEST("RemapS16 uint8 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), + + TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + + TEST("RemapF32 uint8 Replicate", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate", (test_remap_f32), (exec_remap_f32)), TEST("RemapF32 uint8 Constant", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant", (test_remap_f32), (exec_remap_f32)), }; // clang-format on return tests; diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 29c088ab0..e8bf0d008 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1882,12 +1882,24 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, KLEIDICV_API_DECLARATION(kleidicv_remap_f32_u8, const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, - size_t dst_height, size_t channels, float *mapx, - size_t mapx_stride, float *mapy, size_t mapy_stride, + size_t dst_height, size_t channels, const float *mapx, + size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value); +/// @copydoc kleidicv_remap_f32_u8 +KLEIDICV_API_DECLARATION(kleidicv_remap_f32_u16, const uint16_t *src, + size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, + size_t dst_height, size_t channels, const float *mapx, + size_t mapx_stride, const float *mapy, + size_t mapy_stride, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, + const uint16_t *border_value); + #ifndef DOXYGEN /// Internal - not part of the public API and its direct use is not supported. /// diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 6dc743859..6ef344715 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -55,7 +55,8 @@ inline bool remap_f32_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, kleidicv_border_type_t border_type, size_t channels, kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE { - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value || + std::is_same::value) { return ( src_stride <= std::numeric_limits::max() && dst_width >= 4 && src_width <= @@ -100,8 +101,8 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value); @@ -132,8 +133,8 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value); diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/common_sc.h index f7a7eb018..1520dfd5c 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/common_sc.h @@ -90,11 +90,9 @@ svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, } template -svuint32_t inline calculate_linear_replicate(svbool_t pg, svfloat32x2_t coords, - svfloat32_t xmaxf, - svfloat32_t ymaxf, - svuint32_t sv_src_stride, - Rows &src_rows) { +svuint32_t inline calculate_linear_replicated_border( + svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, + svuint32_t sv_src_stride, Rows &src_rows) { auto load_source = [&](svuint32_t x, svuint32_t y) { return load_common(pg, x, y, sv_src_stride, src_rows); }; diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 9debeda61..9d31832c3 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -26,3 +26,7 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u16, KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u8, &kleidicv::neon::remap_f32, &kleidicv::sve2::remap_f32, nullptr); + +KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u16, + &kleidicv::neon::remap_f32, nullptr, + nullptr); diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp index c671c1d9e..d61561777 100644 --- a/kleidicv/src/transform/remap_neon.cpp +++ b/kleidicv/src/transform/remap_neon.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include // TODO: check #include #include "kleidicv/kleidicv.h" @@ -1045,6 +1044,148 @@ class RemapF32Replicate { uint32x4_t v_ymax_; }; // end of class RemapF32Replicate +template +class RemapF32Replicate { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t + + RemapF32Replicate(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u32( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + vq_src_element_stride_{vdupq_n_u32( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapx, + Columns mapy, Columns dst) { + const size_t kStep = VecTraits::num_lanes(); + + auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { + uint32x4_t offset = vmlaq_u32(x, y, vq_src_element_stride_); + uint64_t acc = + static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | + (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | + (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { + uint64x2_t offset_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), + vget_low_u32(y), v_src_element_stride_); + uint64x2_t offset_high = + vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_element_stride_); + uint64_t acc = + static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | + (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) + << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | + (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) + << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto load = [&](uint32x4_t x, uint32x4_t y) { + if constexpr (IsLarge) { + return load_src_into_floats_large(x, y); + } else { + return load_src_into_floats_small(x, y); + } + }; + + auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { + MapVectorType x = vld1q_f32(ptr_mapx); + MapVectorType y = vld1q_f32(ptr_mapy); + // Truncating convert to int + uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); + uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); + + // Get fractional part, or 0 if out of range + float32x4_t zero = vdupq_n_f32(0.F); + uint32x4_t x_in_range = + vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); + uint32x4_t y_in_range = + vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); + float32x4_t xfrac = + vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); + float32x4_t yfrac = + vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); + + // x1 = x0 + 1, except if it's already xmax or out of range + uint32x4_t x1 = vsubq_u32(x0, x_in_range); + uint32x4_t y1 = vsubq_u32(y0, y_in_range); + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + float32x4_t a = load(x0, y0); + float32x4_t b = load(x1, y0); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = load(x0, y1); + float32x4_t d = load(x1, y1); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vminq_u32(vdupq_n_u32(0xFFFF), vcvtaq_u32_f32(result)); + }; + + auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + + ptr_mapx += kStep; + ptr_mapy += kStep; + uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + uint16x8_t result16 = vuzp1q_u16(res0, res1); + + vst1q_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, kStep}; + loop.unroll_twice(vector_path_2); + loop.unroll_once([&](size_t step) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapx -= back_step; + mapy -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + }); + } + + private: + Rows src_rows_; + uint32x2_t v_src_element_stride_; // load_large + uint32x4_t vq_src_element_stride_; // load_small + uint32x4_t v_xmax_; + uint32x4_t v_ymax_; +}; // end of class RemapF32Replicate + template class RemapF32ConstantBorder; @@ -1212,14 +1353,171 @@ class RemapF32ConstantBorder { }; // end of class RemapF32ConstantBorder // NOLINTEND(readability-function-cognitive-complexity) +// TODO: Need to refactor to reduce the complexity +// NOLINTBEGIN(readability-function-cognitive-complexity) +template +class RemapF32ConstantBorder { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t + + RemapF32ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + src_width_{src_width}, + src_height_{src_height}, + border_value_{border_value} {} + + void process_row(size_t width, Columns mapx, + Columns mapy, Columns dst) { + const size_t kStep = VecTraits::num_lanes(); + + auto get_edge_pixels = [&](unsigned &a_result, unsigned &b_result, + unsigned &c_result, unsigned &d_result, int x0, + int y0, ptrdiff_t index, + Rows src_rows, int src_width, + int src_height) { + if (y0 >= 0) { + if (x0 >= 0) { + a_result = src_rows[index]; + } + if (x0 + 1 < src_width) { + b_result = src_rows[index + 1]; + } + } + if (y0 + 1 < src_height) { + index += static_cast(src_rows.stride() / sizeof(ScalarType)); + if (x0 >= 0) { + c_result = src_rows[index]; + } + if (x0 + 1 < src_width) { + d_result = src_rows[index + 1]; + } + } + }; + + auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { + MapVectorType xf = vld1q_f32(ptr_mapx); + MapVectorType yf = vld1q_f32(ptr_mapy); + // Convert obviously out-of-range coordinates to values that are just + // beyond the largest permitted image width & height. This avoids the need + // for special case handling elsewhere. + float32x4_t big = vdupq_n_f32(1 << 24); + xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); + yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); + + int32x4_t x0 = vcvtmq_s32_f32(xf); + int32x4_t y0 = vcvtmq_s32_f32(yf); + int x0_array[4], y0_array[4]; + unsigned a_array[4], b_array[4], c_array[4], d_array[4]; + vst1q_s32(x0_array, x0); + vst1q_s32(y0_array, y0); + for (int i = 0; i < 4; ++i) { + int x0i = x0_array[i]; + int y0i = y0_array[i]; + ptrdiff_t index = + x0i + y0i * static_cast(src_rows_.stride() / + sizeof(ScalarType)); + // std::cout << "x0i " << x0i << " y0i " << y0i << " index: " + // << index + // xw << "\n"; + // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed + if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || + y0i + 1 >= static_cast(src_height_)) { + // Not entirely within the source image + + a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; + + if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || + y0i >= static_cast(src_height_)) { + // Completely outside the source image + continue; + } + + get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, + y0i, index, src_rows_, src_width_, src_height_); + continue; + } + + // Completely inside the source image + a_array[i] = src_rows_[index]; + b_array[i] = src_rows_[index + 1]; + index += src_rows_.stride() / sizeof(ScalarType); + c_array[i] = src_rows_[index]; + d_array[i] = src_rows_[index + 1]; + } + + float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); + float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); + float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vcvtaq_u32_f32(result); + }; + + auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + + ptr_mapx += kStep; + ptr_mapy += kStep; + uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + uint16x8_t result16 = vuzp1q_u16(res0, res1); + + vst1q_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, kStep}; + loop.unroll_twice(vector_path_2); + loop.unroll_once([&](size_t step) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapx -= back_step; + mapy -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + }); + } + + private: + Rows src_rows_; + size_t src_width_; + size_t src_height_; + const ScalarType *border_value_; +}; // end of class RemapF32ConstantBorder +// NOLINTEND(readability-function-cognitive-complexity) + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, [[maybe_unused]] const T *border_value) { @@ -1301,10 +1599,12 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ const type *src, size_t src_stride, size_t src_width, size_t src_height, \ type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, float *mapx, size_t mapx_stride, float *mapy, \ - size_t mapy_stride, kleidicv_interpolation_type_t interpolation, \ + size_t channels, const float *mapx, size_t mapx_stride, \ + const float *mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); } // namespace kleidicv::neon diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index 8cf574b91..d66bf34c4 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -858,7 +858,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicate( + return calculate_linear_replicated_border( pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); @@ -925,8 +925,9 @@ template kleidicv_error_t remap_f32_sc(const T* src, size_t src_stride, size_t src_width, size_t src_height, T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float* mapx, size_t mapx_stride, - float* mapy, size_t mapy_stride, + size_t channels, const float* mapx, + size_t mapx_stride, const float* mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, [[maybe_unused]] const T* border_value) { diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp index 624b7ce24..611eba8e1 100644 --- a/kleidicv/src/transform/remap_sve2.cpp +++ b/kleidicv/src/transform/remap_sve2.cpp @@ -37,15 +37,15 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value) { - return remap_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, - mapx, mapx_stride, mapy, mapy_stride, - interpolation, border_type, border_value); + return remap_f32_sc(src, src_stride, src_width, src_height, dst, + dst_stride, dst_width, dst_height, channels, mapx, + mapx_stride, mapy, mapy_stride, interpolation, + border_type, border_value); } #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ @@ -73,8 +73,9 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ const type *src, size_t src_stride, size_t src_width, size_t src_height, \ type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, float *mapx, size_t mapx_stride, float *mapy, \ - size_t mapy_stride, kleidicv_interpolation_type_t interpolation, \ + size_t channels, const float *mapx, size_t mapx_stride, \ + const float *mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index 1f4c6dbf4..c42bc0ef2 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -299,7 +299,7 @@ void warp_perspective_operation(Rows src_rows, return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); }; - auto calculate_linear_replicate = [&](uint32_t x) { + auto calculate_linear_replicated_border = [&](uint32_t x) { auto load_floats = [&](uint32x4_t x, uint32x4_t y) { if constexpr (IsLarge) { return load_src_into_floats_large(x, y); @@ -341,7 +341,7 @@ void warp_perspective_operation(Rows src_rows, auto calculate_linear = [&](uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicate(x); + return calculate_linear_replicated_border(x); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); auto &&[xf, yf] = calculate_coordinates(x); diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h index 9656d318c..d75958b86 100644 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ b/kleidicv/src/transform/warp_perspective_sc.h @@ -171,7 +171,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicate( + return calculate_linear_replicated_border( pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index dfcdc7cae..76b7a8bae 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -435,11 +435,23 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16( kleidicv_error_t kleidicv_thread_remap_f32_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float *mapx, size_t mapx_stride, float *mapy, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_remap_f32_u16 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_remap_f32_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_warp_perspective_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 2304946d4..b8368e013 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -760,7 +760,7 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16( kleidicv_error_t kleidicv_thread_remap_f32_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float *mapx, size_t mapx_stride, float *mapy, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading mt) { @@ -782,6 +782,31 @@ kleidicv_error_t kleidicv_thread_remap_f32_u8( return parallel_batches(callback, mt, dst_height); } +kleidicv_error_t kleidicv_thread_remap_f32_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading mt) { + if (!kleidicv::remap_f32_is_implemented( + src_stride, src_width, src_height, dst_width, border_type, channels, + interpolation)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_remap_f32_u16( + src, src_stride, src_width, src_height, + dst + static_cast(begin * dst_stride / sizeof(uint16_t)), + dst_stride, dst_width, end - begin, channels, + mapx + static_cast(begin * mapx_stride / sizeof(float)), + mapx_stride, + mapy + static_cast(begin * mapy_stride / sizeof(float)), + mapy_stride, interpolation, border_type, border_value); + }; + return parallel_batches(callback, mt, dst_height); +} + kleidicv_error_t kleidicv_thread_warp_perspective_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 72de4a4bb..790e97b77 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -26,6 +26,12 @@ KLEIDICV_REMAP_S16(uint16_t, u16); KLEIDICV_REMAP_S16POINT5(uint8_t, u8); KLEIDICV_REMAP_S16POINT5(uint16_t, u16); +#define KLEIDICV_REMAP_F32(type, type_suffix) \ + KLEIDICV_API(remap_f32, kleidicv_remap_f32_##type_suffix, type) + +KLEIDICV_REMAP_F32(uint8_t, u8); +KLEIDICV_REMAP_F32(uint16_t, u16); + template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, @@ -560,7 +566,7 @@ class RemapS16Point5 : public testing::Test { mapxy.data(), mapxy.stride(), mapfrac.data(), mapfrac.stride(), border_type, border_value)); - EXPECT_EQ_ARRAY2D(actual, expected); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } private: @@ -858,12 +864,11 @@ class RemapF32 : public testing::Test { size_t dst_h, size_t channels, kleidicv_border_type_t border_type, const ScalarType *border_value, size_t padding) { + test::PseudoRandomNumberGenerator coord_generator; test::Array2D mapx(dst_w, dst_h, padding); - test::PseudoRandomNumberGenerator xcoord_generator; - mapx.fill(xcoord_generator); test::Array2D mapy(dst_w, dst_h, padding); - test::PseudoRandomNumberGenerator ycoord_generator; - mapy.fill(ycoord_generator); + mapx.fill(coord_generator); + mapy.fill(coord_generator); execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type, border_value, padding); } @@ -949,8 +954,8 @@ class RemapF32 : public testing::Test { } } - // This part is the same as execute_test() but without initializing source. - // Corner Cases use the biggest possible source. + // This part is the same as execute_test() but without initializing the + // whole source. Corner Cases use the biggest possible source. size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -959,9 +964,12 @@ class RemapF32 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - const int64_t kMaxVal = std::numeric_limits::max(); + // Initalize the edges only + const int64_t kMaxVal = std::numeric_limits::max() * 3 / 4; + const int64_t kMinVal = + std::numeric_limits::lowest() + kMaxVal / 3; auto generateSource = [&](size_t x, size_t y) { - return static_cast((x + y) % 2 ? kMaxVal : 27); + return static_cast((x + y) % 2 ? kMaxVal : kMinVal); }; for (size_t y = 0; y < src_h; ++y) { *source.at(y, 0) = generateSource(y, 0); @@ -987,13 +995,28 @@ class RemapF32 : public testing::Test { ASSERT_EQ( KLEIDICV_OK, - kleidicv_remap_f32_u8( + remap_f32()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); - EXPECT_EQ_ARRAY2D(actual, expected); + if (expected.compare_to(actual, 1)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapx:\n"; + dump(&mapx); + std::cout << "mapy:\n"; + dump(&mapy); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } + + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } private: @@ -1017,12 +1040,27 @@ class RemapF32 : public testing::Test { ASSERT_EQ( KLEIDICV_OK, - kleidicv_remap_f32_u8( + remap_f32()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); + if (expected.compare_to(actual, 1)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapx:\n"; + dump(&mapx); + std::cout << "mapy:\n"; + dump(&mapy); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } + EXPECT_EQ_ARRAY2D(actual, expected); } @@ -1043,8 +1081,6 @@ class RemapF32 : public testing::Test { for (size_t ch = 0; ch < src.channels(); ++ch) { float x = *mapx.at(row, column); float y = *mapy.at(row, column); - // ptrdiff_t ix = std::floor(x); - // ptrdiff_t iy = std::floor(y); ptrdiff_t ix = static_cast(std::max( INT_MIN, std::min(std::floor(x), @@ -1070,7 +1106,7 @@ class RemapF32 : public testing::Test { } }; -using RemapF32ElementTypes = ::testing::Types; +using RemapF32ElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapF32, RemapF32ElementTypes); TYPED_TEST(RemapF32, RandomNoPadding) { @@ -1169,7 +1205,7 @@ TYPED_TEST(RemapF32, NullPointer) { float mapy[1] = {}; const size_t mapy_stride = dst_width * sizeof(float); const TypeParam border_value[1] = {}; - test::test_null_args(kleidicv_remap_f32_u8, src, src_stride, src_width, + test::test_null_args(remap_f32(), src, src_stride, src_width, src_height, dst, dst_stride, dst_width, dst_height, channels, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, @@ -1187,15 +1223,15 @@ TYPED_TEST(RemapF32, ZeroImageSize) { const size_t mapy_stride = sizeof(float); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 0, 1, dst, dst_stride, 0, 1, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 0, 1, dst, dst_stride, 0, 1, + 1, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 1, 0, dst, dst_stride, 1, 0, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 1, 0, dst, dst_stride, 1, 0, + 1, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, InvalidImageSize) { @@ -1207,32 +1243,32 @@ TYPED_TEST(RemapF32, InvalidImageSize) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, - dst, element_size, 1, 1, 1, mapx, sizeof(float), - mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1, + 1, dst, element_size, 1, 1, 1, mapx, sizeof(float), + mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS, - KLEIDICV_MAX_IMAGE_PIXELS, dst, element_size, - 1, 1, 1, mapx, sizeof(float), mapy, - sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, dst, element_size, + 1, 1, 1, mapx, sizeof(float), mapy, + sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size, - KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapx, - sizeof(float), mapy, sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, element_size, + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapx, + sizeof(float), mapy, sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size, - KLEIDICV_MAX_IMAGE_PIXELS, - KLEIDICV_MAX_IMAGE_PIXELS, 1, mapx, sizeof(float), - mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, element_size, + KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, 1, mapx, sizeof(float), + mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedTwoChannels) { @@ -1245,10 +1281,10 @@ TYPED_TEST(RemapF32, UnsupportedTwoChannels) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1, channels, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1, channels, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) { @@ -1260,10 +1296,10 @@ TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedTooSmallImage) { @@ -1275,10 +1311,10 @@ TYPED_TEST(RemapF32, UnsupportedTooSmallImage) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 3, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 3, + 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigStride) { @@ -1292,10 +1328,10 @@ TYPED_TEST(RemapF32, UnsupportedBigStride) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 1, 1, dst, 16 * element_size, 16, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 1, 1, dst, 16 * element_size, 16, + 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { @@ -1306,18 +1342,18 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { float mapy[16] = {}; EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1ULL << 24, 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1ULL << 24, 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, (1ULL << 32) + 1, 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, (1ULL << 32) + 1, 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { @@ -1328,18 +1364,18 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { float mapy[16] = {}; EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1ULL << 24, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1ULL << 24, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, (1ULL << 32) + 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, (1ULL << 32) + 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { @@ -1351,10 +1387,10 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, - 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { @@ -1366,18 +1402,41 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, Misalignment) { - const size_t element_size = sizeof(TypeParam); - if (element_size == 1) { - // misalignment impossible - GTEST_SKIP(); + const TypeParam src[8] = {}; + TypeParam dst[8]; + const size_t data_stride_ok = sizeof(TypeParam); + const size_t data_stride_nok = sizeof(TypeParam) + 1; + float mapx[8] = {}; + float mapy[8] = {}; + const size_t map_stride_ok = sizeof(float); + const size_t map_stride_nok = sizeof(float) + 1; + auto I = KLEIDICV_INTERPOLATION_LINEAR; + auto B = KLEIDICV_BORDER_TYPE_REPLICATE; + + // Is misalignment of the data possible? + if (data_stride_ok != 1) { + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_nok, 4, 2, dst, data_stride_ok, 4, 2, 1, + mapx, map_stride_ok, mapy, map_stride_ok, I, B, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_ok, 4, 2, dst, data_stride_nok, 4, 2, 1, + mapx, map_stride_ok, mapy, map_stride_ok, I, B, nullptr)); } - - // Will be needed when supporting uint16_t + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_nok, 4, 2, dst, data_stride_ok, 4, 2, 1, mapx, + map_stride_nok, mapy, map_stride_ok, I, B, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()(src, data_stride_ok, 4, 2, dst, + data_stride_ok, 4, 2, 1, mapx, map_stride_ok, + mapy, map_stride_nok, I, B, nullptr)); } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index c930b4757..c1a53f6da 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -788,6 +788,33 @@ TEST_P(Thread, remap_f32_u8_not_implemented) { KLEIDICV_BORDER_TYPE_REFLECT, &border_value); } +TEST_P(Thread, remap_f32_u16_border_replicate) { + check_remap_f32( + kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 1, + KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + +TEST_P(Thread, remap_f32_u16_border_constant) { + const uint16_t border_value = 0; + check_remap_f32(kleidicv_remap_f32_u16, + kleidicv_thread_remap_f32_u16, 1, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + +TEST_P(Thread, remap_f32_u16_not_implemented) { + const uint16_t border_value = 0; + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REFLECT, &border_value); +} + TEST_P(Thread, warp_perspective_u8_border_replicate) { const uint8_t border_value = 0; check_warp_perspective( -- GitLab From 22431b6aefb5ced421087e67a2ac31686b99f0df Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 11 Feb 2025 14:57:14 +0000 Subject: [PATCH 2/2] Implement RemapF32 for u16 data, for SVE2 --- kleidicv/src/transform/common_sc.h | 57 +++++++++++++++++-------- kleidicv/src/transform/remap_api.cpp | 4 +- kleidicv/src/transform/remap_sc.h | 61 ++++++++++++++++++--------- kleidicv/src/transform/remap_sve2.cpp | 1 + test/api/test_remap.cpp | 50 +++++++++++++--------- 5 files changed, 114 insertions(+), 59 deletions(-) diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/common_sc.h index 1520dfd5c..17ac56866 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/common_sc.h @@ -68,24 +68,45 @@ template svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, svuint32_t sv_src_stride, Rows &src_rows) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); - svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); - // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - } else { - svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); + if constexpr (std::is_same::value) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = + svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + svuint64_t result_t = + svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); + } + } else if constexpr (std::is_same::value) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b, result_t; + result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + svuint32_t offsets = + svmla_x(pg, svlsl_n_u32_x(pg, x, 1), y, sv_src_stride); + return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets); + } } } diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 9d31832c3..28ed84d3c 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -28,5 +28,5 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u8, &kleidicv::sve2::remap_f32, nullptr); KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u16, - &kleidicv::neon::remap_f32, nullptr, - nullptr); + &kleidicv::neon::remap_f32, + &kleidicv::sve2::remap_f32, nullptr); diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index d66bf34c4..dc52a2931 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -821,8 +821,8 @@ kleidicv_error_t remap_s16point5_sc( } return KLEIDICV_OK; } -// NOLINTEND(readability-function-cognitive-complexity) +// TODO reduce functional complexity template void remap32f_process_rows(Rows src_rows, size_t src_width, @@ -876,33 +876,55 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, assert(!"INTER_NEAREST not implemented for RemapF32"); // GCOVR_EXCL_STOP } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } loop.unroll_once([&](size_t x) { ScalarType* p_dst = &dst[static_cast(x)]; svuint32_t result = calculate_linear(pg_all32, x); - svst1b_u32(pg_all32, p_dst, result); + if constexpr (std::is_same::value) { + svst1b_u32(pg_all32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg_all32, p_dst, result); + } }); loop.remaining([&](size_t x, size_t x_max) { ScalarType* p_dst = &dst[static_cast(x)]; svbool_t pg32 = svwhilelt_b32(x, x_max); svuint32_t result = calculate_linear(pg32, x); - svst1b_u32(pg32, p_dst, result); + if constexpr (std::is_same::value) { + svst1b_u32(pg32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg32, p_dst, result); + } }); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || @@ -918,6 +940,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, ++dst_rows; } } +// NOLINTEND(readability-function-cognitive-complexity) // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp index 611eba8e1..155b8d88c 100644 --- a/kleidicv/src/transform/remap_sve2.cpp +++ b/kleidicv/src/transform/remap_sve2.cpp @@ -79,5 +79,6 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); } // namespace kleidicv::sve2 diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 790e97b77..5966dd191 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -1061,7 +1061,7 @@ class RemapF32 : public testing::Test { dump(&actual); } - EXPECT_EQ_ARRAY2D(actual, expected); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } static void calculate_expected(test::Array2D &src, @@ -1212,26 +1212,36 @@ TYPED_TEST(RemapF32, NullPointer) { KLEIDICV_BORDER_TYPE_CONSTANT, border_value); } -TYPED_TEST(RemapF32, ZeroImageSize) { - const TypeParam src[1] = {}; - TypeParam dst[1]; - const size_t src_stride = sizeof(TypeParam); - const size_t dst_stride = sizeof(TypeParam); - float mapx[1] = {}; - float mapy[1] = {}; - const size_t mapx_stride = sizeof(float); - const size_t mapy_stride = sizeof(float); +TYPED_TEST(RemapF32, ZeroHeightImage) { + const size_t kW = 4; + const TypeParam src[kW] = {}; + TypeParam dst[kW]; + const size_t src_stride = kW * sizeof(TypeParam); + const size_t big_stride = (1UL << 32UL) - sizeof(TypeParam); + const size_t dst_stride = kW * sizeof(TypeParam); + float mapx[kW] = {}; + float mapy[kW] = {}; + const size_t mapx_stride = kW * sizeof(float); + const size_t mapy_stride = kW * sizeof(float); - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_f32()(src, src_stride, 0, 1, dst, dst_stride, 0, 1, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_f32()(src, src_stride, 1, 0, dst, dst_stride, 1, 0, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + for (auto [border_type, border_value] : get_borders()) { + EXPECT_EQ(KLEIDICV_OK, + remap_f32()(src, src_stride, kW, 1, dst, dst_stride, + kW, 0, 1, mapx, mapx_stride, mapy, + mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + border_type, border_value)); + EXPECT_EQ(KLEIDICV_OK, + remap_f32()(src, big_stride, kW, 2, dst, dst_stride, + kW, 0, 1, mapx, mapx_stride, mapy, + mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + border_type, border_value)); + } + const TypeParam border_value[1] = {0}; + EXPECT_EQ(KLEIDICV_OK, + remap_f32()( + src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, + mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value)); } TYPED_TEST(RemapF32, InvalidImageSize) { -- GitLab