From 3570f79800dd992d66e36bc5c5ce177e3eeaeee2 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 12 Feb 2025 05:12:11 +0000 Subject: [PATCH 1/3] Implement RemapF32 for u16 data, for Neon Patching test_imgwarp is removed, because Imgproc_RemapRelative test is also removed from the list, since WARP_RELATIVE_MAP is not implemented in KleidiCV. --- adapters/opencv/kleidicv_hal.cpp | 18 +- adapters/opencv/kleidicv_hal.h | 13 +- adapters/opencv/opencv-4.11.patch | 19 +- conformity/opencv/test_remap.cpp | 67 +++- kleidicv/include/kleidicv/kleidicv.h | 16 +- kleidicv/include/kleidicv/transform/remap.h | 11 +- kleidicv/src/transform/common_sc.h | 8 +- kleidicv/src/transform/remap_api.cpp | 4 + kleidicv/src/transform/remap_neon.cpp | 310 +++++++++++++++++- kleidicv/src/transform/remap_sc.h | 7 +- kleidicv/src/transform/remap_sve2.cpp | 17 +- .../src/transform/warp_perspective_neon.cpp | 4 +- kleidicv/src/transform/warp_perspective_sc.h | 2 +- .../include/kleidicv_thread/kleidicv_thread.h | 14 +- kleidicv_thread/src/kleidicv_thread.cpp | 27 +- scripts/ci-opencv.sh | 1 + test/api/test_remap.cpp | 247 ++++++++------ test/api/test_thread.cpp | 27 ++ 18 files changed, 638 insertions(+), 174 deletions(-) diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index c5b82b23b..e73c350fa 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1379,8 +1379,9 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, int remap_f32(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, - int dst_width, int dst_height, float *mapx, size_t mapx_step, - float *mapy, size_t mapy_step, int interpolation, int border_type, + int dst_width, int dst_height, const float *mapx, + size_t mapx_step, const float *mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value_f64[4]) { kleidicv_border_type_t kleidicv_border_type; if (from_opencv(border_type, kleidicv_border_type)) { @@ -1392,18 +1393,25 @@ int remap_f32(int src_type, const uchar *src_data, size_t src_step, return CV_HAL_ERROR_NOT_IMPLEMENTED; } - auto border_value = get_border_value(border_value_f64); - auto mt = get_multithreading(); - // Only implement CV_8UC1 so far if (src_type == CV_8UC1) { + auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u8( src_data, src_step, static_cast(src_width), static_cast(src_height), dst_data, dst_step, static_cast(dst_width), static_cast(dst_height), 1, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_16UC1) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_f32_u16( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 1, + mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + kleidicv_border_type, border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index dff501676..dd64629e7 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -160,9 +160,9 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, int remap_f32(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, - int dst_width, int dst_height, float *mapx, size_t mapx_step, - float *mapy, size_t mapy_step, int interpolation, int border_type, - const double border_value[4]); + int dst_width, int dst_height, const float *mapx, + size_t mapx_step, const float *mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); int warp_perspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, @@ -427,13 +427,14 @@ static inline int kleidicv_remap_s16point5_with_fallback( static inline int kleidicv_remap_f32_with_fallback( int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, - int dst_height, float *mapx, size_t mapx_step, float *mapy, + int dst_height, const float *mapx, size_t mapx_step, const float *mapy, size_t mapy_step, int interpolation, int border_type, const double border_value[4]) { return KLEIDICV_HAL_FALLBACK_FORWARD( remap_f32, cv_hal_remap32f, src_type, src_data, src_step, src_width, - src_height, dst_data, dst_step, dst_width, dst_height, mapx, mapx_step, - mapy, mapy_step, interpolation, border_type, border_value); + src_height, dst_data, dst_step, dst_width, dst_height, + const_cast(mapx), mapx_step, const_cast(mapy), + mapy_step, interpolation, border_type, border_value); } #undef cv_hal_remap32f #define cv_hal_remap32f kleidicv_remap_f32_with_fallback diff --git a/adapters/opencv/opencv-4.11.patch b/adapters/opencv/opencv-4.11.patch index 2029c3026..85f8d786b 100644 --- a/adapters/opencv/opencv-4.11.patch +++ b/adapters/opencv/opencv-4.11.patch @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // SPDX-FileCopyrightText: Copyright (C) 2000-2022, Intel Corporation, all rights reserved. // SPDX-FileCopyrightText: Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. // SPDX-FileCopyrightText: Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. @@ -228,20 +228,3 @@ index 673c6f03e6..56d9e0b554 100644 } void CV_ImageWarpBaseTest::validate_results() const -diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp -index e8840d231b..adfaad38b0 100644 ---- a/modules/imgproc/test/test_imgwarp.cpp -+++ b/modules/imgproc/test/test_imgwarp.cpp -@@ -1371,7 +1371,11 @@ TEST_P(Imgproc_RemapRelative, validity) - cv::remap(src, dstRelative, mapRelativeX32F, mapRelativeY32F, interpolation | WARP_RELATIVE_MAP, borderType); - } - -- EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0); -+ if (interpolation != INTER_LINEAR) { -+ EXPECT_EQ(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 0); -+ } else { // TODO: Check whether 4 is acceptable? f32_u8_linear_constant can be 4 -+ EXPECT_LT(cvtest::norm(dstAbsolute, dstRelative, NORM_INF), 5); // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test -+ } - }; - - INSTANTIATE_TEST_CASE_P(ImgProc, Imgproc_RemapRelative, testing::Combine( diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index 89d6b0eba..316ebfdfe 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -10,22 +10,20 @@ #include "opencv2/imgproc/hal/interface.h" #include "tests.h" -const int kMaxHeight = 32, kMaxWidth = 32; +const int kMaxHeight = 36, kMaxWidth = 32; template static cv::Mat get_source_mat(int format) { auto generate_source = [&]() { cv::Mat m(kMaxHeight, kMaxWidth, format); + const int64_t kMaxValue = std::numeric_limits::max(); for (size_t row = 0; row < kMaxHeight; ++row) { for (size_t column = 0; column < kMaxWidth; ++column) { - // Referring to the conformity check in test_warp_perspective - // Remap calculation in float greatly amplifies any small errors - // coming from precision innaccuracies, but ensuring that neighbouring - // pixels have neighbouring values decreases this effect, so it can be - // expected that the error won't be bigger than 1. - const int kMaxValue = std::numeric_limits::max(); - m.at(row, column) = abs( - static_cast(row + column) % (2 * kMaxValue + 1) - kMaxValue); + // Create as many different differences between neighbouring pixels as + // possible + size_t counter = row + column; + m.at(row, column) = + (counter % 2) ? kMaxValue : (counter % (kMaxValue + 1)); } } return m; @@ -159,8 +157,8 @@ bool test_remap_f32(int index, RecreatedMessageQueue& request_queue, cv::Mat source_mat = get_source_mat(Format); cv::RNG rng(0); - for (size_t w = 5; w <= kMaxWidth; w += 3) { - for (size_t h = 5; h <= kMaxHeight; h += 2) { + for (size_t w = 5; w <= kMaxWidth * 2; w += 3) { + for (size_t h = 5; h <= kMaxHeight * 2; h += 2) { cv::Mat map_mat(h * 2, w, CV_32FC1); cv::Mat mapx_mat = map_mat.rowRange(0, h); rng.fill(mapx_mat, cv::RNG::UNIFORM, -3, kMaxWidth + 3); @@ -174,11 +172,41 @@ bool test_remap_f32(int index, RecreatedMessageQueue& request_queue, cv::Mat expected_mat = get_expected_from_subordinate( index, request_queue, reply_queue, map_mat); + // clang-format off + // Reference algorithm in OpenCV uses integer interpolation with 5 bits + // fractional part. That means that the maximum error between that and the + // exact result in one dimension can be as big as 2^data_bits / 2^5 / 2 + // (we cannot have more than 2^32 different results), for two dimensions + // it is the double, which is 8 for u8 and 2048 for u16. + // + // Example in 16 bits: + // Source height = 36, width = 32; + // mapx = 31.17005, mapy = 35.326836 + // so this is a corner case (bottom right corner): + // 65469 123 + // 123 123 (constant border) + // + // Interpolation: + // xfrac = 0.17005, yfrac = 0.326836 + // line0 = 65469 * (1 - 0.17005) + 123 * 0.17005 = 54356.9127 + // line1 = 123 + // EXACT RESULT calculated in float: + // 54356.9127 * (1 - 0.326836) + 123 * 0.326836 = 36631.3176087828 + // + // WITH 5-bit fractional part: + // xfrac is rounded to 0.15625 (5/32) + // line0 = 65469 * (1 - 0.15625) + 123 * 0.15625 = 55258.6875 + // (diff is less than 1024, as provisioned) + // 2nd dimension: line1 = 123 + // yfrac is rounded to 0.3125 (10/32) + // 5BIT RESULT: + // 55258.6875 * (1 - 0.3125) + 123 * 0.3125 = 38028.78515625 + // clang-format on bool success = (CV_MAT_DEPTH(Format) == CV_8U && - !are_matrices_different(2, actual_mat, expected_mat)) || + !are_matrices_different(8, actual_mat, expected_mat)) || (CV_MAT_DEPTH(Format) == CV_16U && - !are_matrices_different(2, actual_mat, expected_mat)); + !are_matrices_different(2048, actual_mat, expected_mat)); if (!success) { fail_print_matrices(w, h, source_mat, actual_mat, expected_mat); std::cout << "=== mapx_mat:" << std::endl; @@ -199,15 +227,18 @@ std::vector& remap_tests_get() { static std::vector tests = { TEST("RemapS16 uint8 Replicate", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Replicate", (test_remap_s16), (exec_remap_s16)), - TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), - TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), - TEST("RemapF32 uint8 Replicate", (test_remap_f32), (exec_remap_f32)), - TEST("RemapS16 uint8 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), + + TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), - TEST("RemapF32 uint8 Constant", (test_remap_f32), (exec_remap_f32)), + + TEST("RemapF32 uint8 Replicate", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant", (test_remap_f32), (exec_remap_f32)), }; // clang-format on return tests; diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 29c088ab0..e8bf0d008 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1882,12 +1882,24 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, KLEIDICV_API_DECLARATION(kleidicv_remap_f32_u8, const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, - size_t dst_height, size_t channels, float *mapx, - size_t mapx_stride, float *mapy, size_t mapy_stride, + size_t dst_height, size_t channels, const float *mapx, + size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value); +/// @copydoc kleidicv_remap_f32_u8 +KLEIDICV_API_DECLARATION(kleidicv_remap_f32_u16, const uint16_t *src, + size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, + size_t dst_height, size_t channels, const float *mapx, + size_t mapx_stride, const float *mapy, + size_t mapy_stride, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, + const uint16_t *border_value); + #ifndef DOXYGEN /// Internal - not part of the public API and its direct use is not supported. /// diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 6dc743859..6ef344715 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -55,7 +55,8 @@ inline bool remap_f32_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, kleidicv_border_type_t border_type, size_t channels, kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE { - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value || + std::is_same::value) { return ( src_stride <= std::numeric_limits::max() && dst_width >= 4 && src_width <= @@ -100,8 +101,8 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value); @@ -132,8 +133,8 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value); diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/common_sc.h index f7a7eb018..1520dfd5c 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/common_sc.h @@ -90,11 +90,9 @@ svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, } template -svuint32_t inline calculate_linear_replicate(svbool_t pg, svfloat32x2_t coords, - svfloat32_t xmaxf, - svfloat32_t ymaxf, - svuint32_t sv_src_stride, - Rows &src_rows) { +svuint32_t inline calculate_linear_replicated_border( + svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, + svuint32_t sv_src_stride, Rows &src_rows) { auto load_source = [&](svuint32_t x, svuint32_t y) { return load_common(pg, x, y, sv_src_stride, src_rows); }; diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 9debeda61..9d31832c3 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -26,3 +26,7 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u16, KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u8, &kleidicv::neon::remap_f32, &kleidicv::sve2::remap_f32, nullptr); + +KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u16, + &kleidicv::neon::remap_f32, nullptr, + nullptr); diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp index c671c1d9e..d61561777 100644 --- a/kleidicv/src/transform/remap_neon.cpp +++ b/kleidicv/src/transform/remap_neon.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include // TODO: check #include #include "kleidicv/kleidicv.h" @@ -1045,6 +1044,148 @@ class RemapF32Replicate { uint32x4_t v_ymax_; }; // end of class RemapF32Replicate +template +class RemapF32Replicate { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t + + RemapF32Replicate(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u32( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + vq_src_element_stride_{vdupq_n_u32( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapx, + Columns mapy, Columns dst) { + const size_t kStep = VecTraits::num_lanes(); + + auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { + uint32x4_t offset = vmlaq_u32(x, y, vq_src_element_stride_); + uint64_t acc = + static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | + (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | + (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { + uint64x2_t offset_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), + vget_low_u32(y), v_src_element_stride_); + uint64x2_t offset_high = + vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_element_stride_); + uint64_t acc = + static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | + (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) + << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | + (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) + << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto load = [&](uint32x4_t x, uint32x4_t y) { + if constexpr (IsLarge) { + return load_src_into_floats_large(x, y); + } else { + return load_src_into_floats_small(x, y); + } + }; + + auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { + MapVectorType x = vld1q_f32(ptr_mapx); + MapVectorType y = vld1q_f32(ptr_mapy); + // Truncating convert to int + uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); + uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); + + // Get fractional part, or 0 if out of range + float32x4_t zero = vdupq_n_f32(0.F); + uint32x4_t x_in_range = + vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); + uint32x4_t y_in_range = + vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); + float32x4_t xfrac = + vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); + float32x4_t yfrac = + vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); + + // x1 = x0 + 1, except if it's already xmax or out of range + uint32x4_t x1 = vsubq_u32(x0, x_in_range); + uint32x4_t y1 = vsubq_u32(y0, y_in_range); + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + float32x4_t a = load(x0, y0); + float32x4_t b = load(x1, y0); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = load(x0, y1); + float32x4_t d = load(x1, y1); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vminq_u32(vdupq_n_u32(0xFFFF), vcvtaq_u32_f32(result)); + }; + + auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + + ptr_mapx += kStep; + ptr_mapy += kStep; + uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + uint16x8_t result16 = vuzp1q_u16(res0, res1); + + vst1q_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, kStep}; + loop.unroll_twice(vector_path_2); + loop.unroll_once([&](size_t step) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapx -= back_step; + mapy -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + }); + } + + private: + Rows src_rows_; + uint32x2_t v_src_element_stride_; // load_large + uint32x4_t vq_src_element_stride_; // load_small + uint32x4_t v_xmax_; + uint32x4_t v_ymax_; +}; // end of class RemapF32Replicate + template class RemapF32ConstantBorder; @@ -1212,14 +1353,171 @@ class RemapF32ConstantBorder { }; // end of class RemapF32ConstantBorder // NOLINTEND(readability-function-cognitive-complexity) +// TODO: Need to refactor to reduce the complexity +// NOLINTBEGIN(readability-function-cognitive-complexity) +template +class RemapF32ConstantBorder { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t + + RemapF32ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + src_width_{src_width}, + src_height_{src_height}, + border_value_{border_value} {} + + void process_row(size_t width, Columns mapx, + Columns mapy, Columns dst) { + const size_t kStep = VecTraits::num_lanes(); + + auto get_edge_pixels = [&](unsigned &a_result, unsigned &b_result, + unsigned &c_result, unsigned &d_result, int x0, + int y0, ptrdiff_t index, + Rows src_rows, int src_width, + int src_height) { + if (y0 >= 0) { + if (x0 >= 0) { + a_result = src_rows[index]; + } + if (x0 + 1 < src_width) { + b_result = src_rows[index + 1]; + } + } + if (y0 + 1 < src_height) { + index += static_cast(src_rows.stride() / sizeof(ScalarType)); + if (x0 >= 0) { + c_result = src_rows[index]; + } + if (x0 + 1 < src_width) { + d_result = src_rows[index + 1]; + } + } + }; + + auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { + MapVectorType xf = vld1q_f32(ptr_mapx); + MapVectorType yf = vld1q_f32(ptr_mapy); + // Convert obviously out-of-range coordinates to values that are just + // beyond the largest permitted image width & height. This avoids the need + // for special case handling elsewhere. + float32x4_t big = vdupq_n_f32(1 << 24); + xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); + yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); + + int32x4_t x0 = vcvtmq_s32_f32(xf); + int32x4_t y0 = vcvtmq_s32_f32(yf); + int x0_array[4], y0_array[4]; + unsigned a_array[4], b_array[4], c_array[4], d_array[4]; + vst1q_s32(x0_array, x0); + vst1q_s32(y0_array, y0); + for (int i = 0; i < 4; ++i) { + int x0i = x0_array[i]; + int y0i = y0_array[i]; + ptrdiff_t index = + x0i + y0i * static_cast(src_rows_.stride() / + sizeof(ScalarType)); + // std::cout << "x0i " << x0i << " y0i " << y0i << " index: " + // << index + // xw << "\n"; + // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed + if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || + y0i + 1 >= static_cast(src_height_)) { + // Not entirely within the source image + + a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; + + if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || + y0i >= static_cast(src_height_)) { + // Completely outside the source image + continue; + } + + get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, + y0i, index, src_rows_, src_width_, src_height_); + continue; + } + + // Completely inside the source image + a_array[i] = src_rows_[index]; + b_array[i] = src_rows_[index + 1]; + index += src_rows_.stride() / sizeof(ScalarType); + c_array[i] = src_rows_[index]; + d_array[i] = src_rows_[index + 1]; + } + + float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); + float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); + float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vcvtaq_u32_f32(result); + }; + + auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + + ptr_mapx += kStep; + ptr_mapy += kStep; + uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + uint16x8_t result16 = vuzp1q_u16(res0, res1); + + vst1q_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, kStep}; + loop.unroll_twice(vector_path_2); + loop.unroll_once([&](size_t step) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + mapx += ptrdiff_t(step); + mapy += ptrdiff_t(step); + dst += ptrdiff_t(step); + }); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapx -= back_step; + mapy -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t) { + const float *ptr_mapx = &mapx[0]; + const float *ptr_mapy = &mapy[0]; + uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(&dst[0], result16); + }); + } + + private: + Rows src_rows_; + size_t src_width_; + size_t src_height_; + const ScalarType *border_value_; +}; // end of class RemapF32ConstantBorder +// NOLINTEND(readability-function-cognitive-complexity) + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, [[maybe_unused]] const T *border_value) { @@ -1301,10 +1599,12 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ const type *src, size_t src_stride, size_t src_width, size_t src_height, \ type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, float *mapx, size_t mapx_stride, float *mapy, \ - size_t mapy_stride, kleidicv_interpolation_type_t interpolation, \ + size_t channels, const float *mapx, size_t mapx_stride, \ + const float *mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); } // namespace kleidicv::neon diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index 8cf574b91..d66bf34c4 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -858,7 +858,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicate( + return calculate_linear_replicated_border( pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); @@ -925,8 +925,9 @@ template kleidicv_error_t remap_f32_sc(const T* src, size_t src_stride, size_t src_width, size_t src_height, T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float* mapx, size_t mapx_stride, - float* mapy, size_t mapy_stride, + size_t channels, const float* mapx, + size_t mapx_stride, const float* mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, [[maybe_unused]] const T* border_value) { diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp index 624b7ce24..611eba8e1 100644 --- a/kleidicv/src/transform/remap_sve2.cpp +++ b/kleidicv/src/transform/remap_sve2.cpp @@ -37,15 +37,15 @@ template kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t channels, - float *mapx, size_t mapx_stride, float *mapy, - size_t mapy_stride, + const float *mapx, size_t mapx_stride, + const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const T *border_value) { - return remap_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, - mapx, mapx_stride, mapy, mapy_stride, - interpolation, border_type, border_value); + return remap_f32_sc(src, src_stride, src_width, src_height, dst, + dst_stride, dst_width, dst_height, channels, mapx, + mapx_stride, mapy, mapy_stride, interpolation, + border_type, border_value); } #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ @@ -73,8 +73,9 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ const type *src, size_t src_stride, size_t src_width, size_t src_height, \ type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, float *mapx, size_t mapx_stride, float *mapy, \ - size_t mapy_stride, kleidicv_interpolation_type_t interpolation, \ + size_t channels, const float *mapx, size_t mapx_stride, \ + const float *mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index 1f4c6dbf4..c42bc0ef2 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -299,7 +299,7 @@ void warp_perspective_operation(Rows src_rows, return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); }; - auto calculate_linear_replicate = [&](uint32_t x) { + auto calculate_linear_replicated_border = [&](uint32_t x) { auto load_floats = [&](uint32x4_t x, uint32x4_t y) { if constexpr (IsLarge) { return load_src_into_floats_large(x, y); @@ -341,7 +341,7 @@ void warp_perspective_operation(Rows src_rows, auto calculate_linear = [&](uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicate(x); + return calculate_linear_replicated_border(x); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); auto &&[xf, yf] = calculate_coordinates(x); diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h index 9656d318c..d75958b86 100644 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ b/kleidicv/src/transform/warp_perspective_sc.h @@ -171,7 +171,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicate( + return calculate_linear_replicated_border( pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index dfcdc7cae..76b7a8bae 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -435,11 +435,23 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16( kleidicv_error_t kleidicv_thread_remap_f32_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float *mapx, size_t mapx_stride, float *mapy, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_remap_f32_u16 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_remap_f32_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_warp_perspective_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 2304946d4..b8368e013 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -760,7 +760,7 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u16( kleidicv_error_t kleidicv_thread_remap_f32_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, float *mapx, size_t mapx_stride, float *mapy, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading mt) { @@ -782,6 +782,31 @@ kleidicv_error_t kleidicv_thread_remap_f32_u8( return parallel_batches(callback, mt, dst_height); } +kleidicv_error_t kleidicv_thread_remap_f32_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, + size_t mapy_stride, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading mt) { + if (!kleidicv::remap_f32_is_implemented( + src_stride, src_width, src_height, dst_width, border_type, channels, + interpolation)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_remap_f32_u16( + src, src_stride, src_width, src_height, + dst + static_cast(begin * dst_stride / sizeof(uint16_t)), + dst_stride, dst_width, end - begin, channels, + mapx + static_cast(begin * mapx_stride / sizeof(float)), + mapx_stride, + mapy + static_cast(begin * mapy_stride / sizeof(float)), + mapy_stride, interpolation, border_type, border_value); + }; + return parallel_batches(callback, mt, dst_height); +} + kleidicv_error_t kleidicv_thread_warp_perspective_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh index d98b8c9c4..07fe62004 100755 --- a/scripts/ci-opencv.sh +++ b/scripts/ci-opencv.sh @@ -104,6 +104,7 @@ IMGPROC_TEST_PATTERNS=( '*Imgproc_Dilate*' '*Imgproc_Erode*' '*Imgproc_PyramidDown*' +# ImgProc_RemapRelative is not implemented so it's omitted '*Imgproc_Remap.*' '*Imgproc_Remap_Test*' '*Imgproc_Warp*' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 72de4a4bb..72975a21f 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -26,6 +26,12 @@ KLEIDICV_REMAP_S16(uint16_t, u16); KLEIDICV_REMAP_S16POINT5(uint8_t, u8); KLEIDICV_REMAP_S16POINT5(uint16_t, u16); +#define KLEIDICV_REMAP_F32(type, type_suffix) \ + KLEIDICV_API(remap_f32, kleidicv_remap_f32_##type_suffix, type) + +KLEIDICV_REMAP_F32(uint8_t, u8); +KLEIDICV_REMAP_F32(uint16_t, u16); + template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, @@ -560,7 +566,7 @@ class RemapS16Point5 : public testing::Test { mapxy.data(), mapxy.stride(), mapfrac.data(), mapfrac.stride(), border_type, border_value)); - EXPECT_EQ_ARRAY2D(actual, expected); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } private: @@ -858,12 +864,11 @@ class RemapF32 : public testing::Test { size_t dst_h, size_t channels, kleidicv_border_type_t border_type, const ScalarType *border_value, size_t padding) { + test::PseudoRandomNumberGenerator coord_generator; test::Array2D mapx(dst_w, dst_h, padding); - test::PseudoRandomNumberGenerator xcoord_generator; - mapx.fill(xcoord_generator); test::Array2D mapy(dst_w, dst_h, padding); - test::PseudoRandomNumberGenerator ycoord_generator; - mapy.fill(ycoord_generator); + mapx.fill(coord_generator); + mapy.fill(coord_generator); execute_test(mapx, mapy, src_w, src_h, dst_w, dst_h, channels, border_type, border_value, padding); } @@ -949,8 +954,8 @@ class RemapF32 : public testing::Test { } } - // This part is the same as execute_test() but without initializing source. - // Corner Cases use the biggest possible source. + // This part is the same as execute_test() but without initializing the + // whole source. Corner Cases use the biggest possible source. size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -959,9 +964,12 @@ class RemapF32 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - const int64_t kMaxVal = std::numeric_limits::max(); + // Initalize the edges only + const int64_t kMaxVal = std::numeric_limits::max() * 3 / 4; + const int64_t kMinVal = + std::numeric_limits::lowest() + kMaxVal / 3; auto generateSource = [&](size_t x, size_t y) { - return static_cast((x + y) % 2 ? kMaxVal : 27); + return static_cast((x + y) % 2 ? kMaxVal : kMinVal); }; for (size_t y = 0; y < src_h; ++y) { *source.at(y, 0) = generateSource(y, 0); @@ -987,13 +995,28 @@ class RemapF32 : public testing::Test { ASSERT_EQ( KLEIDICV_OK, - kleidicv_remap_f32_u8( + remap_f32()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); - EXPECT_EQ_ARRAY2D(actual, expected); + if (expected.compare_to(actual, 1)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapx:\n"; + dump(&mapx); + std::cout << "mapy:\n"; + dump(&mapy); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } + + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } private: @@ -1017,13 +1040,28 @@ class RemapF32 : public testing::Test { ASSERT_EQ( KLEIDICV_OK, - kleidicv_remap_f32_u8( + remap_f32()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); - EXPECT_EQ_ARRAY2D(actual, expected); + if (expected.compare_to(actual, 1)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapx:\n"; + dump(&mapx); + std::cout << "mapy:\n"; + dump(&mapy); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } + + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } static void calculate_expected(test::Array2D &src, @@ -1043,8 +1081,6 @@ class RemapF32 : public testing::Test { for (size_t ch = 0; ch < src.channels(); ++ch) { float x = *mapx.at(row, column); float y = *mapy.at(row, column); - // ptrdiff_t ix = std::floor(x); - // ptrdiff_t iy = std::floor(y); ptrdiff_t ix = static_cast(std::max( INT_MIN, std::min(std::floor(x), @@ -1070,7 +1106,7 @@ class RemapF32 : public testing::Test { } }; -using RemapF32ElementTypes = ::testing::Types; +using RemapF32ElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapF32, RemapF32ElementTypes); TYPED_TEST(RemapF32, RandomNoPadding) { @@ -1169,7 +1205,7 @@ TYPED_TEST(RemapF32, NullPointer) { float mapy[1] = {}; const size_t mapy_stride = dst_width * sizeof(float); const TypeParam border_value[1] = {}; - test::test_null_args(kleidicv_remap_f32_u8, src, src_stride, src_width, + test::test_null_args(remap_f32(), src, src_stride, src_width, src_height, dst, dst_stride, dst_width, dst_height, channels, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, @@ -1187,15 +1223,15 @@ TYPED_TEST(RemapF32, ZeroImageSize) { const size_t mapy_stride = sizeof(float); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 0, 1, dst, dst_stride, 0, 1, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 0, 1, dst, dst_stride, 0, 1, + 1, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 1, 0, dst, dst_stride, 1, 0, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 1, 0, dst, dst_stride, 1, 0, + 1, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, InvalidImageSize) { @@ -1207,32 +1243,32 @@ TYPED_TEST(RemapF32, InvalidImageSize) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, - dst, element_size, 1, 1, 1, mapx, sizeof(float), - mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS + 1, + 1, dst, element_size, 1, 1, 1, mapx, sizeof(float), + mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS, - KLEIDICV_MAX_IMAGE_PIXELS, dst, element_size, - 1, 1, 1, mapx, sizeof(float), mapy, - sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, dst, element_size, + 1, 1, 1, mapx, sizeof(float), mapy, + sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size, - KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapx, - sizeof(float), mapy, sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, element_size, + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapx, + sizeof(float), mapy, sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, element_size, - KLEIDICV_MAX_IMAGE_PIXELS, - KLEIDICV_MAX_IMAGE_PIXELS, 1, mapx, sizeof(float), - mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, element_size, + KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, 1, mapx, sizeof(float), + mapy, sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedTwoChannels) { @@ -1245,10 +1281,10 @@ TYPED_TEST(RemapF32, UnsupportedTwoChannels) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1, channels, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1, channels, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) { @@ -1260,10 +1296,10 @@ TYPED_TEST(RemapF32, UnsupportedInterpolationTypeNEAREST) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedTooSmallImage) { @@ -1275,10 +1311,10 @@ TYPED_TEST(RemapF32, UnsupportedTooSmallImage) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 3, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 3, + 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigStride) { @@ -1292,10 +1328,10 @@ TYPED_TEST(RemapF32, UnsupportedBigStride) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, src_stride, 1, 1, dst, 16 * element_size, 16, - 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, src_stride, 1, 1, dst, 16 * element_size, 16, + 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { @@ -1306,18 +1342,18 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { float mapy[16] = {}; EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1ULL << 24, 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1ULL << 24, 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, (1ULL << 32) + 1, 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, (1ULL << 32) + 1, 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { @@ -1328,18 +1364,18 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { float mapy[16] = {}; EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1ULL << 24, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1ULL << 24, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_f32_u8(src, element_size, 1, (1ULL << 32) + 1, dst, - 16 * element_size, 16, 1, 1, mapx, - 16 * sizeof(float), mapy, 16 * sizeof(float), - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, (1ULL << 32) + 1, dst, + 16 * element_size, 16, 1, 1, mapx, + 16 * sizeof(float), mapy, 16 * sizeof(float), + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { @@ -1351,10 +1387,10 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, - 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { @@ -1366,18 +1402,41 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { EXPECT_EQ( KLEIDICV_ERROR_RANGE, - kleidicv_remap_f32_u8(src, element_size, 1, 1, dst, 16 * element_size, 16, - 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, - 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, + 16, 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, + 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapF32, Misalignment) { - const size_t element_size = sizeof(TypeParam); - if (element_size == 1) { - // misalignment impossible - GTEST_SKIP(); + const TypeParam src[8] = {}; + TypeParam dst[8]; + const size_t data_stride_ok = sizeof(TypeParam); + const size_t data_stride_nok = sizeof(TypeParam) + 1; + float mapx[8] = {}; + float mapy[8] = {}; + const size_t map_stride_ok = sizeof(float); + const size_t map_stride_nok = sizeof(float) + 1; + auto I = KLEIDICV_INTERPOLATION_LINEAR; + auto B = KLEIDICV_BORDER_TYPE_REPLICATE; + + // Is misalignment of the data possible? + if (data_stride_ok != 1) { + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_nok, 4, 2, dst, data_stride_ok, 4, 2, 1, + mapx, map_stride_ok, mapy, map_stride_ok, I, B, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_ok, 4, 2, dst, data_stride_nok, 4, 2, 1, + mapx, map_stride_ok, mapy, map_stride_ok, I, B, nullptr)); } - - // Will be needed when supporting uint16_t + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()( + src, data_stride_nok, 4, 2, dst, data_stride_ok, 4, 2, 1, mapx, + map_stride_nok, mapy, map_stride_ok, I, B, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + remap_f32()(src, data_stride_ok, 4, 2, dst, + data_stride_ok, 4, 2, 1, mapx, map_stride_ok, + mapy, map_stride_nok, I, B, nullptr)); } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index c930b4757..c1a53f6da 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -788,6 +788,33 @@ TEST_P(Thread, remap_f32_u8_not_implemented) { KLEIDICV_BORDER_TYPE_REFLECT, &border_value); } +TEST_P(Thread, remap_f32_u16_border_replicate) { + check_remap_f32( + kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 1, + KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + +TEST_P(Thread, remap_f32_u16_border_constant) { + const uint16_t border_value = 0; + check_remap_f32(kleidicv_remap_f32_u16, + kleidicv_thread_remap_f32_u16, 1, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + +TEST_P(Thread, remap_f32_u16_not_implemented) { + const uint16_t border_value = 0; + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REFLECT, &border_value); +} + TEST_P(Thread, warp_perspective_u8_border_replicate) { const uint8_t border_value = 0; check_warp_perspective( -- GitLab From 61d3ff6e057d2df320365480a87f9e206e47571a Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 12 Feb 2025 10:17:06 +0000 Subject: [PATCH 2/3] Implement RemapF32 for u16 data, for SVE2 --- kleidicv/src/transform/common_sc.h | 57 +++++++++++++++++-------- kleidicv/src/transform/remap_api.cpp | 4 +- kleidicv/src/transform/remap_sc.h | 61 ++++++++++++++++++--------- kleidicv/src/transform/remap_sve2.cpp | 1 + test/api/test_remap.cpp | 48 ++++++++++++--------- 5 files changed, 113 insertions(+), 58 deletions(-) diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/common_sc.h index 1520dfd5c..17ac56866 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/common_sc.h @@ -68,24 +68,45 @@ template svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, svuint32_t sv_src_stride, Rows &src_rows) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); - svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); - // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - } else { - svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); + if constexpr (std::is_same::value) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = + svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + svuint64_t result_t = + svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); + } + } else if constexpr (std::is_same::value) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b, result_t; + result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + svuint32_t offsets = + svmla_x(pg, svlsl_n_u32_x(pg, x, 1), y, sv_src_stride); + return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets); + } } } diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 9d31832c3..28ed84d3c 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -28,5 +28,5 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u8, &kleidicv::sve2::remap_f32, nullptr); KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_f32_u16, - &kleidicv::neon::remap_f32, nullptr, - nullptr); + &kleidicv::neon::remap_f32, + &kleidicv::sve2::remap_f32, nullptr); diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index d66bf34c4..dc52a2931 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -821,8 +821,8 @@ kleidicv_error_t remap_s16point5_sc( } return KLEIDICV_OK; } -// NOLINTEND(readability-function-cognitive-complexity) +// TODO reduce functional complexity template void remap32f_process_rows(Rows src_rows, size_t src_width, @@ -876,33 +876,55 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, assert(!"INTER_NEAREST not implemented for RemapF32"); // GCOVR_EXCL_STOP } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } loop.unroll_once([&](size_t x) { ScalarType* p_dst = &dst[static_cast(x)]; svuint32_t result = calculate_linear(pg_all32, x); - svst1b_u32(pg_all32, p_dst, result); + if constexpr (std::is_same::value) { + svst1b_u32(pg_all32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg_all32, p_dst, result); + } }); loop.remaining([&](size_t x, size_t x_max) { ScalarType* p_dst = &dst[static_cast(x)]; svbool_t pg32 = svwhilelt_b32(x, x_max); svuint32_t result = calculate_linear(pg32, x); - svst1b_u32(pg32, p_dst, result); + if constexpr (std::is_same::value) { + svst1b_u32(pg32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg32, p_dst, result); + } }); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || @@ -918,6 +940,7 @@ void remap32f_process_rows(Rows src_rows, size_t src_width, ++dst_rows; } } +// NOLINTEND(readability-function-cognitive-complexity) // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp index 611eba8e1..155b8d88c 100644 --- a/kleidicv/src/transform/remap_sve2.cpp +++ b/kleidicv/src/transform/remap_sve2.cpp @@ -79,5 +79,6 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); } // namespace kleidicv::sve2 diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 72975a21f..5966dd191 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -1212,26 +1212,36 @@ TYPED_TEST(RemapF32, NullPointer) { KLEIDICV_BORDER_TYPE_CONSTANT, border_value); } -TYPED_TEST(RemapF32, ZeroImageSize) { - const TypeParam src[1] = {}; - TypeParam dst[1]; - const size_t src_stride = sizeof(TypeParam); - const size_t dst_stride = sizeof(TypeParam); - float mapx[1] = {}; - float mapy[1] = {}; - const size_t mapx_stride = sizeof(float); - const size_t mapy_stride = sizeof(float); +TYPED_TEST(RemapF32, ZeroHeightImage) { + const size_t kW = 4; + const TypeParam src[kW] = {}; + TypeParam dst[kW]; + const size_t src_stride = kW * sizeof(TypeParam); + const size_t big_stride = (1UL << 32UL) - sizeof(TypeParam); + const size_t dst_stride = kW * sizeof(TypeParam); + float mapx[kW] = {}; + float mapy[kW] = {}; + const size_t mapx_stride = kW * sizeof(float); + const size_t mapy_stride = kW * sizeof(float); - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_f32()(src, src_stride, 0, 1, dst, dst_stride, 0, 1, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_f32()(src, src_stride, 1, 0, dst, dst_stride, 1, 0, - 1, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + for (auto [border_type, border_value] : get_borders()) { + EXPECT_EQ(KLEIDICV_OK, + remap_f32()(src, src_stride, kW, 1, dst, dst_stride, + kW, 0, 1, mapx, mapx_stride, mapy, + mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + border_type, border_value)); + EXPECT_EQ(KLEIDICV_OK, + remap_f32()(src, big_stride, kW, 2, dst, dst_stride, + kW, 0, 1, mapx, mapx_stride, mapy, + mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + border_type, border_value)); + } + const TypeParam border_value[1] = {0}; + EXPECT_EQ(KLEIDICV_OK, + remap_f32()( + src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, + mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value)); } TYPED_TEST(RemapF32, InvalidImageSize) { -- GitLab From fe4057aeaedf1376fed60ce61e847490c95c64d5 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Wed, 12 Feb 2025 12:49:26 +0000 Subject: [PATCH 3/3] Add documentation and benchmarks for Remap F32 U16 --- CHANGELOG.md | 2 +- benchmark/benchmark.cpp | 177 ++++++++++++++++++++++++++- doc/functionality.md | 2 +- doc/opencv.md | 2 +- kleidicv/include/kleidicv/kleidicv.h | 2 +- scripts/benchmark/benchmarks.txt | 11 +- 6 files changed, 186 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d55cb571f..87be149c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ This changelog aims to follow the guiding principles of - Floating-point coordinates with linear interpolation - Replicated and constant borders - 1-channel only - - u8 and u16 inputs + - u8 and u16 images - WarpPerspective implementation - Nearest and Linear interpolation method, for 1-channel u8 input. diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 9d8e980e4..c61ef14a4 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -518,11 +518,55 @@ static const ScalarType* get_random_mapxy() { return mapxy.data(); } +template +static const ScalarType* get_random_mapx() { + auto generate_mapx = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + std::mt19937_64 rng; + std::uniform_int_distribution dist_x(0, image_width); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + // Use a second degree function to add a nonlinear blend to the image + v[index] = dist_x(rng); + } + } + return v; + }; + static std::vector mapx = generate_mapx(); + return mapx.data(); +} + +template +static const ScalarType* get_random_mapy() { + auto generate_mapx = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + std::mt19937_64 rng; + std::uniform_int_distribution dist_y(0, image_height); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + // Use a second degree function to add a nonlinear blend to the image + v[index] = dist_y(rng); + } + } + return v; + }; + static std::vector mapy = generate_mapx(); + return mapy.data(); +} + template static const ScalarType* get_blend_mapxy() { auto generate_mapxy = [&]() { // Prevent KleidiCV from flattening the image, it affects the performance - // Add 4 bytes padding, so the image won't be processed as a single row + // Add 4 elements' padding, so the image won't be processed as a single row const size_t image_stripe = image_width + 4; std::vector v(image_height * image_stripe * 2); for (int row = 0; row < static_cast(image_height); ++row) { @@ -542,6 +586,51 @@ static const ScalarType* get_blend_mapxy() { return mapxy.data(); } +template +static const ScalarType* get_blend_mapx() { + auto generate_mapx = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + // Use a second degree function to add a nonlinear blend to the image + v[index] = static_cast( + column * 2 - + column * column / static_cast(image_width)); + } + } + return v; + }; + static std::vector mapx = generate_mapx(); + return mapx.data(); +} + +template +static const ScalarType* get_blend_mapy() { + auto generate_mapx = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + // Use a second degree function to add a nonlinear blend to the image + v[index] = static_cast( + row * (image_width - column) / + static_cast(image_width) + + 4 * row / static_cast(image_height)); + } + } + return v; + }; + static std::vector mapy = generate_mapx(); + return mapy.data(); +} + template static const ScalarType* get_flip_mapxy() { auto generate_mapxy = [&]() { @@ -562,6 +651,44 @@ static const ScalarType* get_flip_mapxy() { return mapxy.data(); } +template +static const ScalarType* get_flip_mapx() { + auto generate_mapx = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + v[index] = image_width - column - 0.3F; + } + } + return v; + }; + static std::vector mapx = generate_mapx(); + return mapx.data(); +} + +template +static const ScalarType* get_flip_mapy() { + auto generate_mapy = [&]() { + // Prevent KleidiCV from flattening the image, it affects the performance + // Add 4 elements' padding, so the image won't be processed as a single row + const size_t image_stripe = image_width + 4; + std::vector v(image_height * image_stripe); + for (int row = 0; row < static_cast(image_height); ++row) { + for (int column = 0; column < static_cast(image_width); ++column) { + size_t index = row * image_stripe + column; + v[index] = row + 0.23F; + } + } + return v; + }; + static std::vector mapy = generate_mapy(); + return mapy.data(); +} + template static const ScalarType* get_identity_mapxy() { auto generate_mapxy = [&]() { @@ -706,6 +833,54 @@ BENCH_REMAP_S16POINT5(remap_s16point5_u16_identity, remap_s16point5_u16, get_identity_mapxy, 1, KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); +template +static void remap_f32(Function f, MapFuncX mfx, MapFuncY mfy, size_t channels, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, + benchmark::State& state) { + const T border_value[4] = {}; + bench_functor(state, [f, mfx, mfy, channels, interpolation, border_type, + border_value]() { + (void)f(get_source_buffer_a(), image_width * sizeof(T), image_width, + image_height, get_destination_buffer(), image_width * sizeof(T), + image_width, image_height, channels, mfx(), + image_width * sizeof(float), mfy(), image_width * sizeof(float), + interpolation, border_type, border_value); + }); +} + +#define BENCH_REMAP_F32(benchname, name, mapxfunc, mapyfunc, channels, \ + interpolation, border_type, type) \ + static void benchname(benchmark::State& state) { \ + remap_f32(kleidicv_##name, mapxfunc, mapyfunc, channels, \ + interpolation, border_type, state); \ + } \ + BENCHMARK(benchname) + +BENCH_REMAP_F32(remap_f32_u8_random, remap_f32_u8, get_random_mapx, + get_random_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_REMAP_F32(remap_f32_u8_blend, remap_f32_u8, get_blend_mapx, + get_blend_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_REMAP_F32(remap_f32_u8_flip, remap_f32_u8, get_flip_mapx, + get_flip_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_REMAP_F32(remap_f32_u16_random, remap_f32_u16, get_random_mapx, + get_random_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + +BENCH_REMAP_F32(remap_f32_u16_blend, remap_f32_u16, get_blend_mapx, + get_blend_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + +BENCH_REMAP_F32(remap_f32_u16_flip, remap_f32_u16, get_flip_mapx, + get_flip_mapy, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + // clang-format off static const float transform_identity[] = { 1.0, 0, 0, diff --git a/doc/functionality.md b/doc/functionality.md index ab124b7cc..83c8fdc71 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -97,7 +97,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. |--------------------------------------------|-----|-----| | Remap int16 coordinates | x | x | | Remap int16+uint16 fixed-point coordinates | x | x | -| Remap float32 coordinates | x | | +| Remap float32 coordinates | x | x | # WarpPerspective | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index 2c88467c3..2ddae12c6 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -209,7 +209,7 @@ Notes on parameters: * `src.step` - must be less than 65536 * element size. * `src.width`, `src_height` - must not be greater than 32768. * `src.type()` - supports `CV_8UC1` and `CV_16UC1`. -* `dst.cols` - must be at least 8 +* `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps) * `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. \ Supported map configurations: * `map1.type()` is `CV_16SC2` and `map2` is empty: diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index e8bf0d008..094cc8c41 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1860,7 +1860,7 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, /// least 4. /// @param dst_height Number of rows in the destination data. /// @param channels Number of channels in the (source and destination) -/// data. Must be 1. +/// data. Must be 1. /// @param mapx Pointer to the x coordinates' data. Must be non-null. /// @param mapx_stride Distance in bytes from the start of one row to the /// start of the next row for `mapx`. Must be a multiple diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index fa7514a17..a8dc74e71 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -79,16 +79,17 @@ InRange_U8: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 8UC1, 1, 2)' InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)' Remap_S16_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' -Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' -Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' -Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' -Remap_F32_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' - Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' +Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' +Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' +Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' Remap_F32_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)' -- GitLab