From 3e45dae766b76c96827f4768069aa8fa0dd47c8c Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Thu, 27 Feb 2025 17:44:51 +0000 Subject: [PATCH] Implement Remap Float32 for 2 channels --- CHANGELOG.md | 2 +- adapters/opencv/kleidicv_hal.cpp | 13 +- conformity/opencv/test_remap.cpp | 25 +- doc/functionality.md | 11 +- doc/opencv.md | 5 +- kleidicv/include/kleidicv/kleidicv.h | 2 +- kleidicv/include/kleidicv/transform/remap.h | 22 +- kleidicv/include/kleidicv/types.h | 5 + kleidicv/src/transform/remap_f32_neon.cpp | 206 +++++---- kleidicv/src/transform/remap_f32_sve2.cpp | 367 ++++++++++------ kleidicv/src/transform/transform_common.h | 18 + kleidicv/src/transform/transform_neon.h | 414 ++++++++++++++---- kleidicv/src/transform/transform_sve2.h | 257 ++++++++--- .../src/transform/warp_perspective_neon.cpp | 21 +- .../src/transform/warp_perspective_sve2.cpp | 18 +- kleidicv_thread/src/kleidicv_thread.cpp | 8 +- scripts/benchmark/benchmarks.txt | 16 +- test/api/test_remap.cpp | 168 +++---- test/api/test_thread.cpp | 23 +- 19 files changed, 1096 insertions(+), 505 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d3213a9b..ffaf5232d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ This changelog aims to follow the guiding principles of - Fixed-point coordinates with linear interpolation - Floating-point coordinates with nearest neighbour and linear interpolation - Replicated and constant borders - - 1-channel only + - 1 channel source image for integer and fixed-points coordinates, 1 and 2 channels for floating-point coordinates - u8 and u16 images - WarpPerspective implementation - Nearest and Linear interpolation method, for 1-channel u8 input. 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index e73c350fa..8a897d1e4 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1393,24 +1393,25 @@ int remap_f32(int src_type, const uchar *src_data, size_t src_step, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + size_t channels = (src_type >> CV_CN_SHIFT) + 1; auto mt = get_multithreading(); - if (src_type == CV_8UC1) { + if (CV_MAT_DEPTH(src_type) == CV_8U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u8( src_data, src_step, static_cast(src_width), static_cast(src_height), dst_data, dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + static_cast(dst_width), static_cast(dst_height), + channels, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); - } else if (src_type == CV_16UC1) { + } else if (CV_MAT_DEPTH(src_type) == CV_16U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_f32_u16( reinterpret_cast(src_data), src_step, static_cast(src_width), static_cast(src_height), reinterpret_cast(dst_data), dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, + static_cast(dst_width), static_cast(dst_height), + channels, mapx, mapx_step, mapy, mapy_step, kleidicv_interpolation_type, kleidicv_border_type, border_value.data(), mt)); } diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index face699ec..f80e1ab3d 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -254,14 +254,23 @@ std::vector& remap_tests_get() { TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5),
(exec_remap_s16point5)), - TEST("RemapF32 uint8 Replicate Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Replicate Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Constant Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Constant Linear", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Replicate Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Replicate Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint8 Constant Nearest", (test_remap_f32), (exec_remap_f32)), - TEST("RemapF32 uint16 Constant Nearest", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Linear 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Nearest 1ch", (test_remap_f32), (exec_remap_f32)), + + TEST("RemapF32 uint8 Replicate Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Linear 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Replicate Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Replicate Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint8 Constant Nearest 2ch", (test_remap_f32), (exec_remap_f32)), + TEST("RemapF32 uint16 Constant Nearest 2ch", (test_remap_f32), (exec_remap_f32)), 
}; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 83c8fdc71..3d178946f 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -93,11 +93,12 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | 8x8 | | x | # Remap -| | u8 | u16 | -|--------------------------------------------|-----|-----| -| Remap int16 coordinates | x | x | -| Remap int16+uint16 fixed-point coordinates | x | x | -| Remap float32 coordinates | x | x | +| | 1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | +|---------------------------------------------------|---------|---------|--------|---------| +| Remap int16 coordinates | x | x | | | +| Remap int16+uint16 fixed-point coordinates | x | x | | | +| Remap float32 coordinates - nearest interpolation | x | x | x | x | +| Remap float32 coordinates - linear interpolation | x | x | x | x | # WarpPerspective | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index a972dbdac..dfc154e09 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -208,9 +208,10 @@ Geometrically transforms the `src` image by taking the pixels specified by the c Notes on parameters: * `src.step` - must be less than 65536 * element size. * `src.width`, `src_height` - must not be greater than 32768. -* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. +* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. With `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well. * `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps) -* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. \ +* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. 
+ Supported map configurations: * `map1.type()` is `CV_16SC2` and `map2` is empty: * > ⚠️ **Acceleration will not work unless OpenCV is built from source patched with `opencv-4.11.patch`** diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 2bed6acfd..6a5ff2658 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1861,7 +1861,7 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, /// least 4. /// @param dst_height Number of rows in the destination data. /// @param channels Number of channels in the (source and destination) -/// data. Must be 1. +/// data. Can be 1 or 2. /// @param mapx Pointer to the x coordinates' data. Must be non-null. /// @param mapx_stride Distance in bytes from the start of one row to the /// start of the next row for `mapx`. Must be a multiple diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 15b1654b6..ea72f5666 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -53,21 +53,19 @@ inline bool remap_s16point5_is_implemented( template inline bool remap_f32_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, - kleidicv_border_type_t border_type, size_t channels, + size_t dst_height, kleidicv_border_type_t border_type, size_t channels, kleidicv_interpolation_type_t interpolation) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value || std::is_same::value) { - return ( - src_stride <= std::numeric_limits::max() && dst_width >= 4 && - src_width <= - static_cast(std::numeric_limits::max()) + 1 && - src_height <= - static_cast(std::numeric_limits::max()) + 1 && - (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || - border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - channels == 1 && - (interpolation == KLEIDICV_INTERPOLATION_LINEAR || - interpolation == 
KLEIDICV_INTERPOLATION_NEAREST)); + return (src_stride <= std::numeric_limits::max() && + dst_width >= 4 && src_width < (1ULL << 24) && + src_height < (1ULL << 24) && dst_width < (1ULL << 24) && + dst_height < (1ULL << 24) && src_width > 0 && src_height > 0 && + (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || + border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && + (channels == 1 || channels == 2) && + (interpolation == KLEIDICV_INTERPOLATION_LINEAR || + interpolation == KLEIDICV_INTERPOLATION_NEAREST)); } else { return false; } diff --git a/kleidicv/include/kleidicv/types.h b/kleidicv/include/kleidicv/types.h index 831507833..e1588cc11 100644 --- a/kleidicv/include/kleidicv/types.h +++ b/kleidicv/include/kleidicv/types.h @@ -146,6 +146,11 @@ class Columns final { channels()}; } + // Returns a pointer to a given column. + [[nodiscard]] T *ptr_at(ptrdiff_t column) KLEIDICV_STREAMING_COMPATIBLE { + return ptr_ + column * static_cast(channels()); + } + // Returns the number of channels in a row. 
size_t channels() const KLEIDICV_STREAMING_COMPATIBLE { return channels_; } diff --git a/kleidicv/src/transform/remap_f32_neon.cpp b/kleidicv/src/transform/remap_f32_neon.cpp index 18f517c14..9996840fd 100644 --- a/kleidicv/src/transform/remap_f32_neon.cpp +++ b/kleidicv/src/transform/remap_f32_neon.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include "kleidicv/ctypes.h" @@ -13,46 +11,33 @@ namespace kleidicv::neon { -template -void remap_f32_nearest_replicate( - uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, - Rows src_rows, Columns dst, size_t dst_width, - Columns mapx, Columns mapy, const size_t kStep) { - LoopUnroll2 loop{dst_width, kStep}; - loop.unroll_once([&](size_t x) { - transform_pixels_replicate( - vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, - src_rows, dst.at(x)); - }); -} - -template -void remap_f32_nearest_constant(uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, - Rows src_rows, - Columns dst, size_t dst_width, - Columns mapx, - Columns mapy, const size_t kStep, - ScalarType border_value) { +template +void remap_f32_nearest(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, Columns mapy, + const size_t kStep, const ScalarType *border_values) { LoopUnroll2 loop{dst_width, kStep}; loop.unroll_once([&](size_t x) { - transform_pixels_constant( + transform_pixels( vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, - src_rows, dst.at(x), border_value); + src_rows, dst.at(x), border_values); }); } -template +template void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, Rows src_rows, Columns dst, size_t dst_width, Columns mapx, Columns mapy, - const size_t kStep, ScalarType border_value) { + const size_t kStep, const ScalarType *border_values) { auto load_xy = [&](size_t x) { return FloatVectorPair{vld1q_f32(&mapx[x]), 
vld1q_f32(&mapy[x])}; }; - auto vector_path = [&](size_t x) { + auto vector_path_1ch = [&](size_t x) { float32x4_t a, b, c, d, xfrac, yfrac; if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { load_quad_pixels_replicate( @@ -61,62 +46,113 @@ void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); load_quad_pixels_constant( - load_xy(x), v_xmax, v_ymax, v_src_stride, border_value, src_rows, + load_xy(x), v_xmax, v_ymax, v_src_stride, border_values, src_rows, xfrac, yfrac, a, b, c, d); } return lerp_2d(xfrac, yfrac, a, b, c, d); }; + auto vector_path_2ch = [&](size_t x) { + float32x4x2_t a, b, c, d; + float32x4_t xfrac, yfrac; + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + load_quad_pixels_replicate_2ch( + load_xy(x), v_xmax, v_ymax, v_src_stride, src_rows, xfrac, yfrac, a, + b, c, d); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + load_quad_pixels_constant_2ch( + load_xy(x), v_xmax, v_ymax, v_src_stride, border_values, src_rows, + xfrac, yfrac, a, b, c, d); + } + float32x4_t xfrac_low = vzip1q_f32(xfrac, xfrac); + float32x4_t yfrac_low = vzip1q_f32(yfrac, yfrac); + float32x4_t xfrac_high = vzip2q_f32(xfrac, xfrac); + float32x4_t yfrac_high = vzip2q_f32(yfrac, yfrac); + + uint32x4_t result_low = + lerp_2d(xfrac_low, yfrac_low, a.val[0], b.val[0], c.val[0], d.val[0]); + uint32x4_t result_high = + lerp_2d(xfrac_high, yfrac_high, a.val[1], b.val[1], c.val[1], d.val[1]); + return vuzp1q_u16(result_low, result_high); + }; + LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType *p_dst = &dst[x]; - uint32x4_t res0 = vector_path(x); - x += kStep; - uint32x4_t res1 = vector_path(x); - uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - - x += kStep; - res0 = vector_path(x); - x += kStep; - res1 = vector_path(x); - uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - - vst1q_u8(p_dst, 
vuzp1q_u8(result16_0, result16_1)); - }); - loop.unroll_once([&](size_t x) { - uint32x4_t result = vector_path(x); - dst[x] = vgetq_lane_u32(result, 0); - dst[x + 1] = vgetq_lane_u32(result, 1); - dst[x + 2] = vgetq_lane_u32(result, 2); - dst[x + 3] = vgetq_lane_u32(result, 3); - }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType *p_dst = &dst[x]; - uint32x4_t res0 = vector_path(x); - x += kStep; - uint32x4_t res1 = vector_path(x); - vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); - }); - loop.unroll_once([&](size_t x) { - uint32x4_t result = vector_path(x); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[x], result16); - }); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path_1ch(x); + x += kStep; + uint32x4_t res1 = vector_path_1ch(x); + uint16x8_t result16_0 = vuzp1q_u16(res0, res1); + + x += kStep; + res0 = vector_path_1ch(x); + x += kStep; + res1 = vector_path_1ch(x); + uint16x8_t result16_1 = vuzp1q_u16(res0, res1); + + vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1)); + }); + loop.unroll_once([&](size_t x) { + uint8x16_t result = vreinterpretq_u8_u32(vector_path_1ch(x)); + dst[x] = vgetq_lane_u8(result, 0); + dst[x + 1] = vgetq_lane_u8(result, 4); + dst[x + 2] = vgetq_lane_u8(result, 8); + dst[x + 3] = vgetq_lane_u8(result, 12); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = dst.ptr_at(x); + uint32x4_t res0 = vector_path_1ch(x); + x += kStep; + uint32x4_t res1 = vector_path_1ch(x); + vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); + }); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path_1ch(x); + uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); + vst1_u16(dst.ptr_at(x), result16); + }); + } + } + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + 
loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = dst.ptr_at(x); + uint16x8_t result16_0 = vector_path_2ch(x); + x += kStep; + uint16x8_t result16_1 = vector_path_2ch(x); + vst1q_u8(p_dst, vuzp1q_u8(vreinterpretq_u8_u16(result16_0), + vreinterpretq_u8_u16(result16_1))); + }); + loop.unroll_once([&](size_t x) { + uint16x8_t result = vector_path_2ch(x); + vst1_u8(dst.ptr_at(x), vmovn_u16(result)); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + uint16x8_t result = vector_path_2ch(x); + vst1q_u16(dst.ptr_at(x), result); + }); + } } } template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value, + size_t src_height, const ScalarType *border_values, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end, Rows mapx_rows, Rows mapy_rows) { - uint32x4_t v_src_stride = vdupq_n_u32( + uint32x4_t v_src_element_stride = vdupq_n_u32( static_cast(src_rows.stride() / sizeof(ScalarType))); uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); @@ -124,23 +160,16 @@ void transform_operation(Rows src_rows, size_t src_width, for (size_t y = y_begin; y < y_end; ++y) { if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - remap_f32_nearest_replicate( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), - dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - remap_f32_nearest_constant( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), - dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, - border_value[0]); - } + remap_f32_nearest( + v_xmax, v_ymax, v_src_element_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), 
mapy_rows.as_columns(), kStep, + border_values); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); - remap_f32_linear( - v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + remap_f32_linear( + v_xmax, v_ymax, v_src_element_stride, src_rows, dst_rows.as_columns(), dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, - Border == KLEIDICV_BORDER_TYPE_CONSTANT ? border_value[0] : 0); + border_values); } ++mapx_rows; ++mapy_rows; @@ -170,18 +199,11 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, } if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { + dst_height, border_type, channels, + interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - // Empty source image is not supported - src_width == 0 || src_height == 0) { - return KLEIDICV_ERROR_RANGE; - } - Rows src_rows{src, src_stride, channels}; Rows mapx_rows{mapx, mapx_stride, 1}; Rows mapy_rows{mapy, mapy_stride, 1}; @@ -189,7 +211,7 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, Rectangle rect{dst_width, dst_height}; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, 0, dst_height, mapx_rows, mapy_rows); diff --git a/kleidicv/src/transform/remap_f32_sve2.cpp b/kleidicv/src/transform/remap_f32_sve2.cpp index e4ad3e724..ce05c2170 100644 --- a/kleidicv/src/transform/remap_f32_sve2.cpp +++ b/kleidicv/src/transform/remap_f32_sve2.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include #include @@ -15,33 +13,44 @@ namespace kleidicv::sve2 { -template +template 
void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, Rows src_rows, svuint32_t sv_border, Columns dst, size_t kStep, size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { + Rows mapx_rows, Rows mapy_rows, + [[maybe_unused]] svuint8_t load_table_2ch) { svbool_t pg_all32 = svptrue_b32(); + svbool_t pg_all16 = svptrue_b16(); auto load_coords = [&](svbool_t pg, size_t xs) { auto x = static_cast(xs); return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), svld1_f32(pg, &mapy_rows.as_columns()[x])); }; + auto load_source = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + if constexpr (Channels == 1) { + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return load_xy_2ch(pg, x, y, sv_src_stride, src_rows, + load_table_2ch); + } + }; + auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { svuint32_t x = svget2(coords, 0); svuint32_t y = svget2(coords, 1); if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = - load_xy(in_range, x, y, sv_src_stride, src_rows); + svuint32_t result = load_source(in_range, x, y); // Select between source pixels and border colour return svsel_u32(in_range, result, sv_border); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); - return load_xy(pg, x, y, sv_src_stride, src_rows); + return load_source(pg, x, y); } }; @@ -76,77 +85,131 @@ void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = 
&dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1b_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + x += kStep; + res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svuint8_t result = 
svuzp1_u8(svreinterpret_u8_u16(result0), + svreinterpret_u8_u16(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1h_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svst1(svptrue_b16(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } } - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1h_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svst1(svptrue_b16(), p_dst, result); - }); - 
loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svbool_t pg16 = svwhilelt_b16(0ULL, 2 * length); + svst1b_u16(pg16, dst.ptr_at(static_cast(x)), + svreinterpret_u16_u32(result)); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = dst.ptr_at(static_cast(x)); + svuint32_t result0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t result1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u32(result0), + svreinterpret_u8_u32(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + svuint16_t result = svreinterpret_u16_u32( + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x))); + svst1_u16(pg_all16, dst.ptr_at(static_cast(x)), result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint16_t result = svreinterpret_u16_u32( + get_pixels(pg32, calculate_nearest_coordinates(pg32, x))); + svbool_t pg16 = svwhilelt_b16(2 * x, 2 * x_max); + svst1_u16(pg16, dst.ptr_at(static_cast(x)), result); + }); + } } } -template +template void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, svfloat32_t sv_xmaxf, svfloat32_t sv_ymaxf, svuint32_t sv_src_stride, Rows src_rows, svuint32_t sv_border, Columns dst, size_t kStep, 
size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { + Rows mapx_rows, Rows mapy_rows, + svuint8_t load_table_2ch) { auto load_coords = [&](svbool_t pg, size_t xs) { auto x = static_cast(xs); return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), @@ -156,13 +219,15 @@ void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { svfloat32x2_t coords = load_coords(pg, x); - return calculate_linear_replicated_border( - pg, coords, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows); + return calculate_linear_replicated_border( + pg, coords, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, + load_table_2ch); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); svfloat32x2_t coords = load_coords(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + load_table_2ch); } }; @@ -177,48 +242,94 @@ void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, svbool_t pg_all32 = svptrue_b32(); LoopUnroll2 loop{dst_width, kStep}; - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); + if constexpr (Channels == 1) { + if constexpr (std::is_same::value) { + 
loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } + loop.unroll_once([&](size_t x) { + svuint32_t result = calculate_linear(pg_all32, x); + store_vector(pg_all32, &dst[static_cast(x)], result); }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16 = - svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); - svst1_u16(svptrue_b16(), p_dst, result16); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint32_t result = calculate_linear(pg32, x); + store_vector(pg32, &dst[static_cast(x)], result); }); } - loop.unroll_once([&](size_t x) { - svuint32_t result = calculate_linear(pg_all32, x); - store_vector(pg_all32, &dst[static_cast(x)], result); - }); - loop.remaining([&](size_t x, size_t x_max) { - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t 
result = calculate_linear(pg32, x); - store_vector(pg32, &dst[static_cast(x)], result); - }); + + if constexpr (Channels == 2) { + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = calculate_linear(pg32, x); + svbool_t pg16 = svwhilelt_b16(0ULL, 2 * length); + svst1b_u16(pg16, dst.ptr_at(static_cast(x)), + svreinterpret_u16_u32(result)); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = dst.ptr_at(static_cast(x)); + svuint32_t result0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t result1 = calculate_linear(pg_all32, x); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u32(result0), + svreinterpret_u8_u32(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once( + [&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_generic(x, length, dst); + }); + } + if constexpr (std::is_same::value) { + loop.unroll_once([&](size_t x) { + svuint16_t result = + svreinterpret_u16_u32(calculate_linear(pg_all32, x)); + svst1_u16(svptrue_b16(), dst.ptr_at(static_cast(x)), result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint16_t result = svreinterpret_u16_u32(calculate_linear(pg32, x)); + svbool_t pg16 = svwhilelt_b16(2 * x, 2 * x_max); + svst1_u16(pg16, dst.ptr_at(static_cast(x)), result); + }); + } + } } template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const ScalarType* border_value, Rows dst_rows, size_t dst_width, @@ -231,7 +342,14 @@ void transform_operation(Rows src_rows, size_t src_width, svuint32_t sv_border = svdup_n_u32(0); if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); + 
if constexpr (Channels == 1) { + sv_border = svdup_n_u32(border_value[0]); + } + if constexpr (Channels == 2) { + uint32_t v = static_cast(border_value[0]) | + (static_cast(border_value[1]) << 16); + sv_border = svdup_n_u32(v); + } } svfloat32_t sv_xmaxf = svdup_n_f32(static_cast(src_width - 1)); @@ -239,17 +357,25 @@ void transform_operation(Rows src_rows, size_t src_width, const size_t kStep = VecTraits::num_lanes(); + // Rearrange input for 8bit 2channel: + // Gather Load 16bits, 2x 8bits for 2 channels: + // after 32-bit gather load: ..DC..BA + // goal is to have 16-bit elements: .D.C.B.A + svuint8_t load_table_2ch = + svreinterpret_u8_u32(svindex_u32(0x03010200U, 0x04040404)); + for (size_t y = y_begin; y < y_end; ++y) { Columns dst = dst_rows.as_columns(); if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - remap_f32_nearest( + remap_f32_nearest( sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, - dst_width, mapx_rows, mapy_rows); + dst_width, mapx_rows, mapy_rows, load_table_2ch); } else { static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); - remap_f32_linear( + remap_f32_linear( sv_xmax, sv_ymax, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, - sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows); + sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows, + load_table_2ch); } ++mapx_rows; ++mapy_rows; @@ -279,18 +405,11 @@ kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, } if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { + dst_height, border_type, channels, + interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - // Empty source image is not supported - src_width == 0 || src_height == 0) { - return KLEIDICV_ERROR_RANGE; - } - Rows 
src_rows{src, src_stride, channels}; Rows mapx_rows{mapx, mapx_stride, 1}; Rows mapy_rows{mapy, mapy_stride, 1}; @@ -298,7 +417,7 @@ kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, Rectangle rect{dst_width, dst_height}; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, 0, dst_height, mapx_rows, mapy_rows); diff --git a/kleidicv/src/transform/transform_common.h b/kleidicv/src/transform/transform_common.h index eaf005f66..da5bbeeef 100644 --- a/kleidicv/src/transform/transform_common.h +++ b/kleidicv/src/transform/transform_common.h @@ -13,6 +13,24 @@ bool is_image_large(const Rows &rows, size_t height) { return rows.stride() * height >= 1ULL << 32; } +// Convert channels to a template argument. +template +void transform_operation(size_t channels, Args &&...args) { + switch (channels) { + case 1: + transform_operation( + std::forward(args)...); + break; + case 2: + transform_operation( + std::forward(args)...); + default: + return; + } +} + // Convert border_type to a template argument. 
template diff --git a/kleidicv/src/transform/transform_neon.h b/kleidicv/src/transform/transform_neon.h index afcc72da8..ab0f5b2ca 100644 --- a/kleidicv/src/transform/transform_neon.h +++ b/kleidicv/src/transform/transform_neon.h @@ -2,8 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - +#include "kleidicv/ctypes.h" #include "kleidicv/neon.h" #include "kleidicv/types.h" #include "transform_common.h" @@ -18,33 +17,70 @@ template float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, Rows& src_rows) { if constexpr (IsLarge) { - uint64x2_t offset_low = + uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), vget_low_u32(v_src_stride)); - uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); uint64_t acc = - static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); + static_cast(src_rows[vgetq_lane_u64(indices_low, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(indices_low, 1)]) << 32); uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = - static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); + acc = static_cast(src_rows[vgetq_lane_u64(indices_high, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(indices_high, 1)]) + << 32); rawsrc = vsetq_lane_u64(acc, rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } else { - uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); uint64_t acc = - static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); + static_cast(src_rows[vgetq_lane_u32(indices, 0)]) | + (static_cast(src_rows[vgetq_lane_u32(indices, 1)]) << 32); uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = 
static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); + acc = static_cast(src_rows[vgetq_lane_u32(indices, 2)]) | + (static_cast(src_rows[vgetq_lane_u32(indices, 3)]) << 32); rawsrc = vsetq_lane_u64(acc, rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } } +template +float32x4x2_t inline load_xy_2ch(uint32x4_t x, uint32x4_t y, + uint32x4_t v_src_stride, + Rows& src_rows) { + const size_t kBytes = 2 * sizeof(ScalarType); + ScalarType elements[4 * 2]; // 4 pixels, 2 channels + // Multiply x with the number of channels (2) + x = vshlq_n_u32(x, 1); + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + memcpy(&elements[0], &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); + memcpy(&elements[2], &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); + memcpy(&elements[4], &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); + memcpy(&elements[6], &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + memcpy(&elements[0], &src_rows[vgetq_lane_u32(indices, 0)], kBytes); + memcpy(&elements[2], &src_rows[vgetq_lane_u32(indices, 1)], kBytes); + memcpy(&elements[4], &src_rows[vgetq_lane_u32(indices, 2)], kBytes); + memcpy(&elements[6], &src_rows[vgetq_lane_u32(indices, 3)], kBytes); + } + uint16x8_t pixels16{}; + if constexpr (std::is_same::value) { + pixels16 = vmovl_u8(vld1_u8(elements)); + } else if constexpr (std::is_same::value) { + pixels16 = vld1q_u16(elements); + } + float32x4x2_t result; + result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); + result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); + return result; +} + template float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, uint32x4_t in_range, @@ -52,40 
+88,40 @@ float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, Rows src_rows) { if constexpr (IsLarge) { - uint64x2_t offset_low = + uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), vget_low_u32(v_src_stride)); - uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); uint64_t pixel0 = vgetq_lane_u32(in_range, 0) - ? src_rows[vgetq_lane_u64(offset_low, 0)] + ? src_rows[vgetq_lane_u64(indices_low, 0)] : border_value; uint64_t pixel1 = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u64(offset_low, 1)] + ? src_rows[vgetq_lane_u64(indices_low, 1)] : border_value; uint64_t pixel2 = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u64(offset_high, 0)] + ? src_rows[vgetq_lane_u64(indices_high, 0)] : border_value; uint64_t pixel3 = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u64(offset_high, 1)] + ? src_rows[vgetq_lane_u64(indices_high, 1)] : border_value; uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); } else { - uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); uint64_t pixel0 = vgetq_lane_u32(in_range, 0) - ? src_rows[vgetq_lane_u32(offset, 0)] + ? src_rows[vgetq_lane_u32(indices, 0)] : border_value; uint64_t pixel1 = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u32(offset, 1)] + ? src_rows[vgetq_lane_u32(indices, 1)] : border_value; uint64_t pixel2 = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u32(offset, 2)] + ? src_rows[vgetq_lane_u32(indices, 2)] : border_value; uint64_t pixel3 = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u32(offset, 3)] + ? 
src_rows[vgetq_lane_u32(indices, 3)] : border_value; uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); @@ -94,6 +130,62 @@ float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, } } +template +float32x4x2_t inline load_xy_or_border_2ch(uint32x4_t x, uint32x4_t y, + uint32x4_t in_range, + const ScalarType* border_values, + uint32x4_t v_src_stride, + Rows src_rows) { + const size_t kBytes = 2 * sizeof(ScalarType); + const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; + ScalarType elements[4 * 2]; // 4 pixels, 2 channels + // Multiply x with the number of channels + x = vshlq_n_u32(x, 1); + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + pixel0 = vgetq_lane_u32(in_range, 0) + ? &src_rows[vgetq_lane_u64(indices_low, 0)] + : border_values; + pixel1 = vgetq_lane_u32(in_range, 1) + ? &src_rows[vgetq_lane_u64(indices_low, 1)] + : border_values; + pixel2 = vgetq_lane_u32(in_range, 2) + ? &src_rows[vgetq_lane_u64(indices_high, 0)] + : border_values; + pixel3 = vgetq_lane_u32(in_range, 3) + ? &src_rows[vgetq_lane_u64(indices_high, 1)] + : border_values; + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + pixel0 = vgetq_lane_u32(in_range, 0) ? &src_rows[vgetq_lane_u32(indices, 0)] + : border_values; + pixel1 = vgetq_lane_u32(in_range, 1) ? &src_rows[vgetq_lane_u32(indices, 1)] + : border_values; + pixel2 = vgetq_lane_u32(in_range, 2) ? &src_rows[vgetq_lane_u32(indices, 2)] + : border_values; + pixel3 = vgetq_lane_u32(in_range, 3) ? 
&src_rows[vgetq_lane_u32(indices, 3)] + : border_values; + } + memcpy(&elements[0], pixel0, kBytes); + memcpy(&elements[2], pixel1, kBytes); + memcpy(&elements[4], pixel2, kBytes); + memcpy(&elements[6], pixel3, kBytes); + uint16x8_t pixels16{}; + if constexpr (std::is_same::value) { + pixels16 = vmovl_u8(vld1_u8(elements)); + } else if constexpr (std::is_same::value) { + pixels16 = vld1q_u16(elements); + } + float32x4x2_t result; + result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); + result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); + return result; +} + template void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, @@ -124,10 +216,40 @@ void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, d = load_xy(x1, y1, v_src_stride, src_rows); } +template +void load_quad_pixels_replicate_2ch(FloatVectorPair xy, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_stride, + Rows src_rows, + float32x4_t& xfrac, float32x4_t& yfrac, + float32x4x2_t& a, float32x4x2_t& b, + float32x4x2_t& c, float32x4x2_t& d) { + auto&& [xf, yf] = xy; + // Truncating convert to int + uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); + uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); + + // Get fractional part, or 0 if out of range + float32x4_t zero = vdupq_n_f32(0.F); + uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); + uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); + xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + + // x1 = x0 + 1, except if it's already xmax or out of range + uint32x4_t x1 = vsubq_u32(x0, x_in_range); + uint32x4_t y1 = vsubq_u32(y0, y_in_range); + + // a: top left, b: top right, c: bottom left, d: bottom right + a = load_xy_2ch(x0, y0, v_src_stride, src_rows); + b = load_xy_2ch(x1, y0, v_src_stride, src_rows); + c = load_xy_2ch(x0, y1, v_src_stride, src_rows); + d = 
load_xy_2ch(x1, y1, v_src_stride, src_rows); +} + template void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, - ScalarType border_value, + const ScalarType* border_values, Rows src_rows, float32x4_t& xfrac, float32x4_t& yfrac, float32x4_t& a, float32x4_t& b, float32x4_t& c, @@ -155,14 +277,55 @@ void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, c_in_range = vandq(x0_in_range, y1_in_range); d_in_range = vandq(x1_in_range, y1_in_range); } - a = load_xy_or_border(x0, y0, a_in_range, border_value, - v_src_stride, src_rows); - b = load_xy_or_border(x1, y0, b_in_range, border_value, - v_src_stride, src_rows); - c = load_xy_or_border(x0, y1, c_in_range, border_value, - v_src_stride, src_rows); - d = load_xy_or_border(x1, y1, d_in_range, border_value, - v_src_stride, src_rows); + a = load_xy_or_border( + x0, y0, a_in_range, border_values[0], v_src_stride, src_rows); + b = load_xy_or_border( + x1, y0, b_in_range, border_values[0], v_src_stride, src_rows); + c = load_xy_or_border( + x0, y1, c_in_range, border_values[0], v_src_stride, src_rows); + d = load_xy_or_border( + x1, y1, d_in_range, border_values[0], v_src_stride, src_rows); +} + +template +void load_quad_pixels_constant_2ch(FloatVectorPair xy, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_stride, + const ScalarType* border_values, + Rows src_rows, + float32x4_t& xfrac, float32x4_t& yfrac, + float32x4x2_t& a, float32x4x2_t& b, + float32x4x2_t& c, float32x4x2_t& d) { + auto&& [xf, yf] = xy; + // Convert coordinates to integers, truncating towards minus infinity. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
+ uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); + uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); + uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); + uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); + xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; + { + uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); + uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); + uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); + uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); + a_in_range = vandq(x0_in_range, y0_in_range); + b_in_range = vandq(x1_in_range, y0_in_range); + c_in_range = vandq(x0_in_range, y1_in_range); + d_in_range = vandq(x1_in_range, y1_in_range); + } + a = load_xy_or_border_2ch( + x0, y0, a_in_range, border_values, v_src_stride, src_rows); + b = load_xy_or_border_2ch( + x1, y0, b_in_range, border_values, v_src_stride, src_rows); + c = load_xy_or_border_2ch( + x0, y1, c_in_range, border_values, v_src_stride, src_rows); + d = load_xy_or_border_2ch( + x1, y1, d_in_range, border_values, v_src_stride, src_rows); } inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, @@ -173,10 +336,10 @@ inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, return vcvtaq_u32_f32(result); } -template +template void transform_pixels_replicate(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, + uint32x4_t v_src_element_stride, Rows src_rows, Columns dst) { // Round to nearest, with Ties To Away (i.e. 
round 0.5 up) @@ -184,34 +347,71 @@ void transform_pixels_replicate(float32x4_t xf, float32x4_t yf, // (vcvtaq already converted negative values to 0) uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax); uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax); - + if constexpr (Channels == 2) { + // Multiply x with the number of channels + x = vshlq_n_u32(x, 1); + } // Copy pixels from source if constexpr (IsLarge) { uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride)); + vget_low_u32(v_src_element_stride)); uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); - dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)]; - dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)]; - dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)]; - dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)]; + vget_low_u32(v_src_element_stride)); + if constexpr (Channels == 1) { + dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)]; + dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)]; + dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)]; + dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)]; + } else { + const size_t kBytes = Channels * sizeof(ScalarType); + memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); + memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); + memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); + memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); + } } else { - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); - dst[0] = src_rows[vgetq_lane_u32(indices, 0)]; - dst[1] = src_rows[vgetq_lane_u32(indices, 1)]; - dst[2] = src_rows[vgetq_lane_u32(indices, 2)]; - dst[3] = src_rows[vgetq_lane_u32(indices, 3)]; + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + if constexpr (Channels == 1) { + dst[0] = src_rows[vgetq_lane_u32(indices, 0)]; + dst[1] = src_rows[vgetq_lane_u32(indices, 
1)]; + dst[2] = src_rows[vgetq_lane_u32(indices, 2)]; + dst[3] = src_rows[vgetq_lane_u32(indices, 3)]; + } else { + const size_t kBytes = Channels * sizeof(ScalarType); + memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u32(indices, 0)], kBytes); + memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u32(indices, 1)], kBytes); + memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u32(indices, 2)], kBytes); + memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u32(indices, 3)], kBytes); + } } } -template +template +static const ScalarType* get_src_or_border_small( + uint32x4_t in_range, Rows src_rows, uint32x4_t indices, + const ScalarType* border_values) { + return vgetq_lane_u32(in_range, Lane) + ? &src_rows[vgetq_lane_u32(indices, Lane)] + : border_values; +} + +template +static const ScalarType* get_src_or_border_large( + uint32x4_t in_range, Rows src_rows, uint64x2_t indices, + const ScalarType* border_values) { + return vgetq_lane_u32(in_range, Lane) + ? &src_rows[vgetq_lane_u64(indices, Lane % 2)] + : border_values; +} + +template void transform_pixels_constant(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, uint32x4_t v_ymax, - uint32x4_t v_src_stride, + uint32x4_t v_src_element_stride, Rows src_rows, Columns dst, - ScalarType border_value) { + const ScalarType* border_values) { // Convert coordinates to integers. // Negative numbers will become large positive numbers. // Since the source width and height is known to be <=2^24 these large @@ -222,34 +422,88 @@ void transform_pixels_constant(float32x4_t xf, float32x4_t yf, uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); // Copy pixels from source - if constexpr (IsLarge) { - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride)); - uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), - vget_low_u32(v_src_stride)); - dst[0] = vgetq_lane_u32(in_range, 0) - ? 
src_rows[vgetq_lane_u64(indices_low, 0)] - : border_value; - dst[1] = vgetq_lane_u32(in_range, 1) - ? src_rows[vgetq_lane_u64(indices_low, 1)] - : border_value; - dst[2] = vgetq_lane_u32(in_range, 2) - ? src_rows[vgetq_lane_u64(indices_high, 0)] - : border_value; - dst[3] = vgetq_lane_u32(in_range, 3) - ? src_rows[vgetq_lane_u64(indices_high, 1)] - : border_value; + if constexpr (Channels == 1) { + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_element_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_element_stride)); + dst[0] = *get_src_or_border_large<0>(in_range, src_rows, indices_low, + border_values); + dst[1] = *get_src_or_border_large<1>(in_range, src_rows, indices_low, + border_values); + dst[2] = *get_src_or_border_large<2>(in_range, src_rows, indices_high, + border_values); + dst[3] = *get_src_or_border_large<3>(in_range, src_rows, indices_high, + border_values); + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + dst[0] = *get_src_or_border_small<0>(in_range, src_rows, indices, + border_values); + dst[1] = *get_src_or_border_small<1>(in_range, src_rows, indices, + border_values); + dst[2] = *get_src_or_border_small<2>(in_range, src_rows, indices, + border_values); + dst[3] = *get_src_or_border_small<3>(in_range, src_rows, indices, + border_values); + } + } else { // Channels > 1 + const size_t kBytes = Channels * sizeof(ScalarType); + const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; + // Multiply x with the number of channels + if constexpr (Channels == 2) { + x = vshlq_n_u32(x, 1); + } else { + x = vmulq_n_u32(x, Channels); + } + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_element_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + 
vget_low_u32(v_src_element_stride)); + pixel0 = get_src_or_border_large<0>(in_range, src_rows, indices_low, + border_values); + pixel1 = get_src_or_border_large<1>(in_range, src_rows, indices_low, + border_values); + pixel2 = get_src_or_border_large<2>(in_range, src_rows, indices_high, + border_values); + pixel3 = get_src_or_border_large<3>(in_range, src_rows, indices_high, + border_values); + + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); + pixel0 = get_src_or_border_small<0>(in_range, src_rows, indices, + border_values); + pixel1 = get_src_or_border_small<1>(in_range, src_rows, indices, + border_values); + pixel2 = get_src_or_border_small<2>(in_range, src_rows, indices, + border_values); + pixel3 = get_src_or_border_small<3>(in_range, src_rows, indices, + border_values); + } + memcpy(dst.ptr_at(0), pixel0, kBytes); + memcpy(dst.ptr_at(1), pixel1, kBytes); + memcpy(dst.ptr_at(2), pixel2, kBytes); + memcpy(dst.ptr_at(3), pixel3, kBytes); + } +} + +template +void transform_pixels(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, + uint32x4_t v_ymax, uint32x4_t v_src_element_stride, + Rows src_rows, Columns dst, + [[maybe_unused]] const ScalarType* border_values) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + transform_pixels_replicate( + xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst); } else { - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); - dst[0] = vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(indices, 0)] - : border_value; - dst[1] = vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(indices, 1)] - : border_value; - dst[2] = vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(indices, 2)] - : border_value; - dst[3] = vgetq_lane_u32(in_range, 3) ? 
src_rows[vgetq_lane_u32(indices, 3)] - : border_value; + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + transform_pixels_constant( + xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst, + border_values); } } diff --git a/kleidicv/src/transform/transform_sve2.h b/kleidicv/src/transform/transform_sve2.h index 74b6d889a..b9cb5ed46 100644 --- a/kleidicv/src/transform/transform_sve2.h +++ b/kleidicv/src/transform/transform_sve2.h @@ -2,21 +2,12 @@ // // SPDX-License-Identifier: Apache-2.0 -#include - -#include #include #include #include #include -#include -#include -#include -#include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/sve2.h" -#include "kleidicv/traits.h" #include "kleidicv/types.h" #include "transform_common.h" @@ -26,56 +17,115 @@ template svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y, svuint32_t sv_src_stride, Rows &src_rows) { - if constexpr (std::is_same::value) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t result_b, result_t; + if constexpr (std::is_same::value) { svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - } else { - svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows[0], 
offsets); + result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); } - } else if constexpr (std::is_same::value) { - if constexpr (IsLarge) { - svbool_t pg_b = pg; - svbool_t pg_t = svtrn2_b32(pg, svpfalse()); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits + if constexpr (std::is_same::value) { + // Multiply x with sizeof(uint16_t) svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); // Copy pixels from source - svuint64_t result_b, result_t; result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b); result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); + } + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } else { + if constexpr (std::is_same::value) { + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); } else { - svuint32_t offsets = - svmla_x(pg, svlsl_n_u32_x(pg, x, 1), y, sv_src_stride); + // Multiply by sizeof(uint16_t) + x = svlsl_x(pg, x, 1); + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets); } } } template +svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_src_stride, + Rows &src_rows, + [[maybe_unused]] svuint8_t load_table) { + if constexpr (IsLarge) { + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + if constexpr (std::is_same::value) { + // Multiply x with the number of channels + svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); + 
svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); + // Copy pixels from source + svuint64_t b = svld1uh_gather_offset_u64( + pg_b, reinterpret_cast(&src_rows[0]), offsets_b); + svuint64_t t = svld1uh_gather_offset_u64( + pg_t, reinterpret_cast(&src_rows[0]), offsets_t); + svuint32_t r32 = + svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t)); + return svreinterpret_u32_u8( + svtbl_u8(svreinterpret_u8_u32(r32), load_table)); + } + if constexpr (std::is_same::value) { + // Multiply x with the number of channels and sizeof(uint16_t) + svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = svld1uw_gather_offset_u64( + pg_b, reinterpret_cast(&src_rows[0]), offsets_b); + svuint64_t result_t = svld1uw_gather_offset_u64( + pg_t, reinterpret_cast(&src_rows[0]), offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + } + } else { + // Multiply x with the number of channels and sizeof(ScalarType) + // This shifting formula is only correct for 8 and 16 bits + x = svlsl_n_u32_x(pg, x, sizeof(ScalarType)); + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + if constexpr (std::is_same::value) { + svuint32_t r32 = svld1uh_gather_offset_u32( + pg, reinterpret_cast(&src_rows[0]), offsets); + return svreinterpret_u32_u8( + svtbl_u8(svreinterpret_u8_u32(r32), load_table)); + } + if constexpr (std::is_same::value) { + return svld1_gather_u32offset_u32( + pg, reinterpret_cast(&src_rows[0]), offsets); + } + } +} + +template svuint32_t inline calculate_linear_replicated_border( svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, - svuint32_t sv_src_stride, Rows &src_rows) { + svuint32_t sv_src_stride, Rows &src_rows, + svuint8_t load_table_2ch) { + svbool_t pg_all32 = svptrue_b32(); + auto load_source = [&](svuint32_t x, svuint32_t y) { - return load_xy(pg, x, 
y, sv_src_stride, src_rows); + if constexpr (Channels == 1) { + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return load_xy_2ch(pg, x, y, sv_src_stride, src_rows, + load_table_2ch); + } }; - svbool_t pg_all32 = svptrue_b32(); + svfloat32_t xf = svget2(coords, 0); svfloat32_t yf = svget2(coords, 1); // Take the integer part, clamp it to within the dimensions of the @@ -99,19 +149,43 @@ svuint32_t inline calculate_linear_replicated_border( svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0); svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0); + auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, + svuint32_t di) { + svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); + svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); + svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); + }; + // Calculate offsets from coordinates (y * stride + x) // a: top left, b: top right, c: bottom left, d: bottom right - svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0)); - svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0)); - svfloat32_t line0 = - svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1)); - svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1)); - svfloat32_t line1 = - svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); - svfloat32_t result = - svmla_f32_x(pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); - return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); 
+ svuint32_t a = load_source(x0, y0); + svuint32_t b = load_source(x1, y0); + svuint32_t c = load_source(x0, y1); + svuint32_t d = load_source(x1, y1); + if constexpr (Channels == 1) { + return lerp_2d(a, b, c, d); + } + if constexpr (Channels == 2) { + // Channel 0 + svuint32_t res32_0 = lerp_2d( + svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), + svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); + // Channel 1 + svuint32_t res32_1 = lerp_2d( + svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), + svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); + + return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1))); + } } template @@ -128,10 +202,39 @@ svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y, } template +svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_border, svuint32_t sv_xmax, + svuint32_t sv_ymax, + svuint32_t sv_src_stride, + Rows &src_rows, + svuint8_t load_table) { + svbool_t in_range = + svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = load_xy_2ch( + in_range, x, y, sv_src_stride, src_rows, load_table); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); +} + +template svuint32_t inline calculate_linear_constant_border( svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, - Rows &src_rows) { + Rows &src_rows, svuint8_t load_table_2ch) { + svbool_t pg_all32 = svptrue_b32(); + + auto load_source = [&](svuint32_t x, svuint32_t y) { + if constexpr (Channels == 1) { + return get_pixels_or_border( + pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + } + if constexpr (Channels == 2) { + return get_pixels_or_border_2ch( + pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + 
load_table_2ch); + } + }; + // Convert coordinates to integers, truncating towards minus infinity. // Negative numbers will become large positive numbers. // Since the source width and height is known to be <=2^24 these large @@ -153,23 +256,43 @@ svuint32_t inline calculate_linear_constant_border( yfrac = svsub_f32_x(pg, yf, yf0); } - svfloat32_t a = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x0, y0, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t b = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x1, y0, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x0, y1, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t d = svcvt_f32_u32_x(pg, get_pixels_or_border( - pg, x1, y1, sv_border, sv_xmax, - sv_ymax, sv_src_stride, src_rows)); - svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac); - svfloat32_t result = - svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac); - return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result)); + auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, + svuint32_t di) { + svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); + svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); + svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); + }; + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + svuint32_t a = load_source(x0, y0); + svuint32_t b = load_source(x1, y0); + svuint32_t c 
= load_source(x0, y1); + svuint32_t d = load_source(x1, y1); + if constexpr (Channels == 1) { + return lerp_2d(a, b, c, d); + } + if constexpr (Channels == 2) { + // Channel 0 + svuint32_t res32_0 = lerp_2d( + svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), + svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); + // Channel 1 + svuint32_t res32_1 = lerp_2d( + svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), + svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); + + return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1))); + } } } // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index dd7494523..e76d14493 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -1,9 +1,7 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 -#include - #include #include "kleidicv/ctypes.h" @@ -34,10 +32,11 @@ namespace kleidicv::neon { // template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const float transform[9], - const ScalarType *border_value, + const ScalarType *border_values, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end) { static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; @@ -73,8 +72,8 @@ void transform_operation(Rows src_rows, size_t src_width, } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); load_quad_pixels_constant( - calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, - border_value[0], src_rows, xfrac, yfrac, a, b, c, d); + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, border_values, + 
src_rows, xfrac, yfrac, a, b, c, d); } return lerp_2d(xfrac, yfrac, a, b, c, d); }; @@ -96,16 +95,16 @@ void transform_operation(Rows src_rows, size_t src_width, if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { loop.unroll_once([&](size_t x) { auto &&[xf, yf] = calculate_coordinates(x); - transform_pixels_replicate( + transform_pixels_replicate( xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x)); }); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); loop.unroll_once([&](size_t x) { auto &&[xf, yf] = calculate_coordinates(x); - transform_pixels_constant( + transform_pixels_constant( xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x), - border_value[0]); + border_values); }); } } else { @@ -168,7 +167,7 @@ kleidicv_error_t warp_perspective_stripe( dst_rows += y_begin; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, transformation, border_value, dst_rows, dst_width, y_begin, y_end); return KLEIDICV_OK; diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index b1f3df8d2..7b00b8492 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -33,7 +33,8 @@ namespace kleidicv::sve2 { // template + kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, + size_t Channels> void transform_operation(Rows src_rows, size_t src_width, size_t src_height, const ScalarType *border_value, Rows dst_rows, size_t dst_width, @@ -160,15 +161,20 @@ void transform_operation(Rows src_rows, size_t src_width, svst1b_u32(pg32, &dst[static_cast(x)], result); }; + // WarpPerspective does not implement 2 channels, so this is dummy + svuint8_t dummy_load_table_2ch{}; + auto calculate_linear = [&](svbool_t pg, uint32_t x) { svfloat32x2_t coords = calc_coords(pg, x); if constexpr (Border == 
KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); + return calculate_linear_replicated_border( + pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows, + dummy_load_table_2ch); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, + dummy_load_table_2ch); } }; @@ -265,7 +271,7 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe( dst_rows += y_begin; transform_operation(is_image_large(src_rows, src_height), interpolation, - border_type, src_rows, src_width, src_height, + border_type, channels, src_rows, src_width, src_height, border_value, dst_rows, dst_width, y_begin, y_end, transform); diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index b8368e013..f72010f92 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -765,8 +765,8 @@ kleidicv_error_t kleidicv_thread_remap_f32_u8( kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading mt) { if (!kleidicv::remap_f32_is_implemented( - src_stride, src_width, src_height, dst_width, border_type, channels, - interpolation)) { + src_stride, src_width, src_height, dst_width, dst_height, border_type, + channels, interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } auto callback = [=](unsigned begin, unsigned end) { @@ -790,8 +790,8 @@ kleidicv_error_t kleidicv_thread_remap_f32_u16( kleidicv_border_type_t border_type, const uint16_t *border_value, kleidicv_thread_multithreading mt) { if (!kleidicv::remap_f32_is_implemented( - src_stride, src_width, src_height, dst_width, border_type, channels, - interpolation)) { + src_stride, src_width, src_height, dst_width, 
dst_height, border_type, + channels, interpolation)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } auto callback = [=](unsigned begin, unsigned end) { diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 0625b6bd3..ca82aeda0 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -86,14 +86,22 @@ Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8U Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' -Remap_F32_U8_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' -Remap_F32_U8_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' -Remap_F32_U16_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' -Remap_F32_U16_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' Remap_F32_U16_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U16_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Nearest_2ch: 
opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Nearest_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Linear: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U8_Replicate_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U8_Constant_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC2, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' +Remap_F32_U16_Replicate_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_LINEAR, BORDER_REPLICATE)' +Remap_F32_U16_Constant_Linear_2ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC2, 32FC1, INTER_LINEAR, BORDER_CONSTANT)' WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index d3bde250c..bea82b90b 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -37,11 +37,12 @@ static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, kleidicv_border_type_t border_type, const ScalarType *border_value) { if (border_type == 
KLEIDICV_BORDER_TYPE_REPLICATE) { - x = std::clamp(x, 0, static_cast(src.width()) - 1); + x = std::clamp( + x, 0, static_cast(src.width() / src.channels()) - 1); y = std::clamp(y, 0, static_cast(src.height()) - 1); } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); - if (x >= static_cast(src.width()) || + if (x * src.channels() >= src.width() || y >= static_cast(src.height()) || x < 0 || y < 0) { return border_value; } @@ -216,7 +217,6 @@ static const auto &get_borders() { static const T border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {4, 5, 6, 7}; static const std::array borders{ P{KLEIDICV_BORDER_TYPE_REPLICATE, nullptr}, - P{KLEIDICV_BORDER_TYPE_REPLICATE, border_value}, P{KLEIDICV_BORDER_TYPE_CONSTANT, border_value}, }; return borders; @@ -970,28 +970,29 @@ class RemapF32 : public testing::Test { test::Array2D expected{dst_total_width, dst_h, padding, channels}; - // Initalize the edges only + // Initalize the four corners only const int64_t kMaxVal = std::numeric_limits::max() * 3 / 4; const int64_t kMinVal = std::numeric_limits::lowest() + kMaxVal / 3; auto generateSource = [&](size_t x, size_t y) { return static_cast((x + y) % 2 ? 
kMaxVal : kMinVal); }; - for (size_t y = 0; y < src_h; ++y) { + for (size_t y = 0; y < 2; ++y) { *source.at(y, 0) = generateSource(y, 0); *source.at(y, 1) = generateSource(y, 1); *source.at(y, 2) = generateSource(y, 2); *source.at(y, src_w - 3) = generateSource(y, src_w - 3); *source.at(y, src_w - 2) = generateSource(y, src_w - 2); *source.at(y, src_w - 1) = generateSource(y, src_w - 1); - } - for (size_t x = 0; x < src_w; ++x) { - *source.at(0, x) = generateSource(0, x); - *source.at(1, x) = generateSource(1, x); - *source.at(2, x) = generateSource(2, x); - *source.at(src_h - 3, x) = generateSource(src_h - 3, x); - *source.at(src_h - 2, x) = generateSource(src_h - 2, x); - *source.at(src_h - 1, x) = generateSource(src_h - 1, x); + *source.at(src_h - y - 1, 0) = generateSource(src_h - y - 1, 0); + *source.at(src_h - y - 1, 1) = generateSource(src_h - y - 1, 1); + *source.at(src_h - y - 1, 2) = generateSource(src_h - y - 1, 2); + *source.at(src_h - y - 1, src_w - 3) = + generateSource(src_h - y - 1, src_w - 3); + *source.at(src_h - y - 1, src_w - 2) = + generateSource(src_h - y - 1, src_w - 2); + *source.at(src_h - y - 1, src_w - 1) = + generateSource(src_h - y - 1, src_w - 1); } test::PseudoRandomNumberGenerator generator; @@ -1000,13 +1001,12 @@ class RemapF32 : public testing::Test { calculate_expected(source, mapx, mapy, border_type, border_value, interpolation, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_f32()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), - interpolation, border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_f32()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapx.data(), mapx.stride(), mapy.data(), + mapy.stride(), interpolation, border_type, border_value)); if (expected.compare_to(actual, 1)) { 
if (source.width() < 100 && source.height() < 100) { @@ -1048,13 +1048,12 @@ class RemapF32 : public testing::Test { calculate_expected(source, mapx, mapy, border_type, border_value, interpolation, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_f32()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapx.data(), mapx.stride(), mapy.data(), mapy.stride(), - interpolation, border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_f32()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapx.data(), mapx.stride(), mapy.data(), + mapy.stride(), interpolation, border_type, border_value)); if (expected.compare_to(actual, 1)) { if (source.width() < 100 && source.height() < 100) { @@ -1142,14 +1141,15 @@ TYPED_TEST(RemapF32, RandomNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 0; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1159,13 +1159,15 @@ TYPED_TEST(RemapF32, BlendPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 13; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - 
TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, interpolation, padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1175,14 +1177,15 @@ TYPED_TEST(RemapF32, OutsideRandomPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 13; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, + interpolation, padding); + } } } } @@ -1192,13 +1195,15 @@ TYPED_TEST(RemapF32, BlendBigStride) { size_t src_h = 2; size_t dst_w = src_w; size_t dst_h = src_h; - size_t channels = 1; size_t padding = 1 << 16; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, interpolation, padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, +
border_type, border_value, interpolation, + padding); + } } } } @@ -1208,14 +1213,15 @@ TYPED_TEST(RemapF32, CornerCases) { size_t src_h = (1ULL << 12) - 1; size_t dst_w = 4; size_t dst_h = 3 * test::Options::vector_lanes() - 1; - size_t channels = 1; size_t padding = 17; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1227,14 +1233,15 @@ TYPED_TEST(RemapF32, CornerCasesLargeLoad) { size_t src_h = 1ULL << 14; size_t dst_w = 3 * test::Options::vector_lanes() - 1; size_t dst_h = 4; - size_t channels = 1; size_t padding = 1; - for (auto [border_type, border_value] : get_borders()) { - for (auto interpolation : - {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, interpolation, - padding); + for (size_t channels = 1; channels <= 2; ++channels) { + for (auto [border_type, border_value] : get_borders()) { + for (auto interpolation : + {KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_INTERPOLATION_NEAREST}) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, interpolation, + padding); + } } } } @@ -1249,17 +1256,18 @@ TYPED_TEST(RemapF32, NullPointer) { const size_t dst_stride = dst_width * element_size; const TypeParam src[4] = {}; TypeParam dst[1]; - const size_t channels = 1; float mapx[1] = {}; const size_t mapx_stride = dst_width * 
sizeof(float); float mapy[1] = {}; const size_t mapy_stride = dst_width * sizeof(float); const TypeParam border_value[1] = {}; - test::test_null_args(remap_f32(), src, src_stride, src_width, - src_height, dst, dst_stride, dst_width, dst_height, - channels, mapx, mapx_stride, mapy, mapy_stride, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + for (size_t channels = 1; channels <= 2; ++channels) { + test::test_null_args(remap_f32(), src, src_stride, src_width, + src_height, dst, dst_stride, dst_width, dst_height, + channels, mapx, mapx_stride, mapy, mapy_stride, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + } } TYPED_TEST(RemapF32, ZeroHeightImage) { @@ -1288,7 +1296,7 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { border_type, border_value)); } const TypeParam border_value[1] = {0}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()( src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, @@ -1333,13 +1341,13 @@ TYPED_TEST(RemapF32, InvalidImageSize) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapF32, UnsupportedTwoChannels) { +TYPED_TEST(RemapF32, UnsupportedThreeChannels) { const size_t element_size = sizeof(TypeParam); const TypeParam src[1] = {}; TypeParam dst[16]; float mapx[16] = {}; float mapy[16] = {}; - const size_t channels = 2; + const size_t channels = 3; EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, @@ -1388,7 +1396,7 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceWidth) { float mapx[16] = {}; float mapy[16] = {}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1ULL << 24, 1, dst, 16 * element_size, 16, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), @@ -1410,7 +1418,7 @@ TYPED_TEST(RemapF32, UnsupportedBigSourceHeight) { float mapx[16] = {}; float mapy[16] = {}; - EXPECT_EQ(KLEIDICV_ERROR_RANGE, + 
EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1ULL << 24, dst, 16 * element_size, 16, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), @@ -1433,7 +1441,7 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationWidth) { float mapy[16] = {}; EXPECT_EQ( - KLEIDICV_ERROR_RANGE, + KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 1ULL << 24, 1, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, @@ -1448,7 +1456,7 @@ TYPED_TEST(RemapF32, UnsupportedBigDestinationHeight) { float mapy[16] = {}; EXPECT_EQ( - KLEIDICV_ERROR_RANGE, + KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_f32()(src, element_size, 1, 1, dst, 16 * element_size, 16, 1ULL << 24, 1, mapx, 16 * sizeof(float), mapy, 16 * sizeof(float), KLEIDICV_INTERPOLATION_LINEAR, diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 128cf7af7..9a2b9affa 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -766,6 +766,9 @@ TEST_P(Thread, remap_f32_u8_border_replicate) { check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, + 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } TEST_P(Thread, remap_f32_u8_border_constant) { @@ -773,12 +776,18 @@ TEST_P(Thread, remap_f32_u8_border_constant) { check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); + check_remap_f32(kleidicv_remap_f32_u8, kleidicv_thread_remap_f32_u8, + 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } TEST_P(Thread, remap_f32_u8_not_implemented) { const uint8_t border_value = 0; check_remap_f32_not_implemented( - kleidicv_thread_remap_f32_u8, 2, KLEIDICV_INTERPOLATION_LINEAR, + kleidicv_thread_remap_f32_u8, 3, 
KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u8, 4, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); check_remap_f32_not_implemented( kleidicv_thread_remap_f32_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, @@ -789,6 +798,9 @@ TEST_P(Thread, remap_f32_u16_border_replicate) { check_remap_f32( kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + check_remap_f32( + kleidicv_remap_f32_u16, kleidicv_thread_remap_f32_u16, 2, + KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } TEST_P(Thread, remap_f32_u16_border_constant) { @@ -797,12 +809,19 @@ TEST_P(Thread, remap_f32_u16_border_constant) { kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); + check_remap_f32(kleidicv_remap_f32_u16, + kleidicv_thread_remap_f32_u16, 2, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } TEST_P(Thread, remap_f32_u16_not_implemented) { const uint16_t border_value = 0; check_remap_f32_not_implemented( - kleidicv_thread_remap_f32_u16, 2, KLEIDICV_INTERPOLATION_LINEAR, + kleidicv_thread_remap_f32_u16, 3, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); + check_remap_f32_not_implemented( + kleidicv_thread_remap_f32_u16, 4, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, &border_value); check_remap_f32_not_implemented( kleidicv_thread_remap_f32_u16, 1, KLEIDICV_INTERPOLATION_LINEAR, -- GitLab