diff --git a/CHANGELOG.md b/CHANGELOG.md index 828d9009220c483d53d56563ea8c9eafdbbc93a1..7d4665637c9ca779fd75f4d1f5151993f80f750d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,16 +17,16 @@ This changelog aims to follow the guiding principles of ### Added - Implementation of Rotate 90 degrees clockwise. -- Remap implementations for u8 and u16 images - - Integer coordinates with nearest neighbour method (1 channel only) - - Replicated and constant borders - - Fixed-point coordinates with linear interpolation - - 1 channel with Replicated and constant borders - - 4 channels with Replicated borders only - - Floating-point coordinates with nearest neighbour and linear interpolation (1 and 2 channels) - - Replicated and constant borders -- WarpPerspective implementation - - Nearest and Linear interpolation method, for 1-channel u8 input. +- Remap implementations for u8 and u16 images, Replicated and Constant borders + - Integer coordinates with Nearest Neighbour method + - 1 channel only + - Fixed-point coordinates with Linear interpolation + - 1 and 4 channels + - Floating-point coordinates with Nearest Neighbour and Linear interpolation + - 1 and 2 channels +- WarpPerspective implementation for 1-channel u8 input + - Nearest Neighbour and Linear interpolation methods + - Replicated and Constant borders ### Changed - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL. 
diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index 566caf88fb0f12b7027db51692dc52500f05765a..54e5de999dff1f727765e8c246307839349dcf12 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -259,7 +259,9 @@ std::vector& remap_tests_get() { TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapF32 uint8 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), TEST("RemapF32 uint16 Replicate Linear 1ch", (test_remap_f32), (exec_remap_f32)), diff --git a/doc/functionality.md b/doc/functionality.md index 62bc8417046615690b7d76618930370c0a9118a9..095a8d6b42a9f1d62ba3c33cc1898ce304a22c13 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -96,12 +96,10 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. 
| | 1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | 4ch u8 | 4ch u16 | |---------------------------------------------------|---------|---------|--------|---------|--------|---------| | Remap int16 coordinates | x | x | | | | | -| Remap int16+uint16 fixed-point coordinates | x | x | | | R | R | +| Remap int16+uint16 fixed-point coordinates | x | x | | | x | x | | Remap float32 coordinates - nearest interpolation | x | x | x | x | | | | Remap float32 coordinates - linear interpolation | x | x | x | x | | | -R = Replicated borders only - # WarpPerspective | | u8 | |----------------------|-----| diff --git a/doc/opencv.md b/doc/opencv.md index f70aab86f63c50d630dffee2a1d7dc06c3514b3a..5195dac274daec73d4d094f1dd0b8e157ee0fac0 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -210,7 +210,7 @@ Notes on parameters: * `src.width`, `src_height` - must not be greater than 32768. * `src.type()` - supports `CV_8UC1` and `CV_16UC1` with all map configs * additionally, with `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well. - * additionally, with `CV_16SC2` plus `CV_16UC1` map config and `BORDER_REPLICATE`, it supports `CV_8UC4` and `CV_16UC4` + * additionally, with `CV_16SC2` plus `CV_16UC1` map config, it supports `CV_8UC4` and `CV_16UC4` * `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps) * `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 2c4e1dff5ce8207415680c7fa9d47d2e3a771a0a..8c76910ad8c1a12271c91d1acd54d16a56d9ccb6 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1786,11 +1786,9 @@ KLEIDICV_API_DECLARATION(kleidicv_in_range_f32, const float *src, /// start of the next row for the destination data. /// Must be a multiple of `sizeof(int16_t)` and no less than /// `width * sizeof(int16_t)`, except for single-row images. 
-/// @param channels Number of channels in the data: \n -/// - Must be 1 for constant border. -// - Must be 1 or 4 for replicate border. -/// @param border_type Way of handling the border. The supported border types -/// are: \n +/// @param channels Number of channels in the data. Must be 1. +/// @param border_type Way of handling the border. The supported border types +/// are: \n /// - @ref KLEIDICV_BORDER_TYPE_CONSTANT /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE /// @param border_value Border value if the border_type is @@ -1816,6 +1814,9 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16_u16, const uint16_t *src, /// Internal - not part of the public API and its direct use is not supported. /// Functionality is similar to @ref kleidicv_remap_s16_u8 , the difference is /// in the data format: it contains a fractional part with 5+5 bits (`mapfrac`). +/// The supported channel counts also differ: +/// @param channels Number of channels in the data. +/// - Supported values: 1 and 4. KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u8, const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 6f807c4b7b8f22a241fa2c906ea687bd9fe80ae7..51bb2e51e1d6dac9a9c305542a4eab2f3fbd76da 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -45,8 +45,7 @@ inline bool remap_s16point5_is_implemented( src_height <= std::numeric_limits::max() + 1 && (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - (channels == 1 || - (channels == 4 && border_type == KLEIDICV_BORDER_TYPE_REPLICATE))); + (channels == 1 || channels == 4)); } else { return false; } diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 
32562c519a6cecea4c52ea41e08eebe9362ed62d..40d8fa85acc1110a1c2e28fa76106b5ad5eb30c9 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -854,6 +854,366 @@ class RemapS16Point5Replicate4ch { int16x8_t v_ymax_; }; // end of class RemapS16Point5Replicate4ch +// Constant border specific functions +// Takes x0, y0, xfrac and yfrac from get_coordinates(), derives x1 = x0 + 1 +// and y1 = y0 + 1, and sets the in_range_a..d masks for the corners +// a=(x0,y0), b=(x1,y0), c=(x0,y1), d=(x1,y1): a lane is all ones only when +// x < v_width and y < v_height (unsigned comparison). +inline void get_coordinates_constant( + Columns mapxy, Columns mapfrac, + uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, uint16x8_t &xfrac, uint16x8_t &yfrac, + uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c, + uint16x8_t &in_range_d) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + uint16x8_t one = vdupq_n_u16(1); + x1 = vaddq_u16(x0, one); + y1 = vaddq_u16(y0, one); + + uint16x8_t x0_in_range = vcltq_u16(x0, v_width); + uint16x8_t y0_in_range = vcltq_u16(y0, v_height); + uint16x8_t x1_in_range = vcltq_u16(x1, v_width); + uint16x8_t y1_in_range = vcltq_u16(y1, v_height); + + in_range_a = vandq(x0_in_range, y0_in_range); + in_range_b = vandq(x1_in_range, y0_in_range); + in_range_c = vandq(x0_in_range, y1_in_range); + in_range_d = vandq(x1_in_range, y1_in_range); +} + +// Selects offsets where in_range bits are set and 0 elsewhere. +inline uint32x4_t zero_out_of_range_offsets(uint32x4_t in_range, + uint32x4_t offsets) { + return vbslq_u32(in_range, offsets, vdupq_n_u32(0)); +} + +// Selects pixel bits where in_range bits are set and v_border bits elsewhere; +// in_range has one 32-bit lane per 4-channel u8 pixel. +inline uint8x16_t replace_pixel_with_border_u8_4ch(uint32x4_t in_range, + uint8x16_t pixels, + uint8x16_t v_border) { + return vreinterpretq_u8_u32( + vbslq_u32(in_range, vreinterpretq_u32_u8(pixels), + vreinterpretq_u32_u8(v_border))); +} + +// Same selection for u16 data: one 64-bit in_range lane per 4-channel pixel. +inline uint16x8_t replace_pixel_with_border_u16_4ch(uint64x2_t in_range, + uint16x8_t pixels, + uint16x8_t v_border) { + return vreinterpretq_u16_u64( + vbslq_u64(in_range, vreinterpretq_u64_u16(pixels), + vreinterpretq_u64_u16(v_border))); +} + +inline void load_pixels_u8_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, 
+ uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, + uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c, + uint8x16_t &d) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); + + a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border); + b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border); + c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border); + d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border); +} + +inline void load_pixels_u16_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint32x4_t in_range_a, + uint32x4_t in_range_b, uint32x4_t in_range_c, uint32x4_t in_range_d, + uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo, + uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo, + uint16x8_t &d_hi) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + // Convert bitsets such as in_range to 64bits, making all 1s or all 0s + auto low32_to_u64 = [](uint32x4_t bitset) { + return vreinterpretq_u64_s64( + vmovl_s32(vreinterpret_s32_u32(vget_low_u32(bitset)))); + }; + + a_lo = 
replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_a), a_lo, + v_border); + b_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_b), b_lo, + v_border); + c_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_c), c_lo, + v_border); + d_lo = replace_pixel_with_border_u16_4ch(low32_to_u64(in_range_d), d_lo, + v_border); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); + + // Convert bitsets such as in_range to 64bits, making all 1s or all 0s + auto hi32_to_u64 = [](uint32x4_t bitset) { + return vreinterpretq_u64_s64(vmovl_high_s32(vreinterpretq_s32_u32(bitset))); + }; + + a_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_a), a_hi, + v_border); + b_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_b), b_hi, + v_border); + c_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_c), c_hi, + v_border); + d_hi = replace_pixel_with_border_u16_4ch(hi32_to_u64(in_range_d), d_hi, + v_border); +} + +// Convert bitsets such as in_range to 32bits, making all 1s or all 0s +static uint32x4_t low16_to_s32(uint16x8_t bitset) { + return vreinterpretq_u32_s32( + vmovl_s16(vreinterpret_s16_u16(vget_low_u16(bitset)))); +} + +static uint32x4_t hi16_to_s32(uint16x8_t bitset) { + return vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(bitset))); +} + +template +class RemapS16Point5Constant4ch; + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{} { + uint32_t 
border_value_32{}; + memcpy(&border_value_32, border_value, sizeof(uint32_t)); + v_border_ = vreinterpretq_u8_u32(vdupq_n_u32(border_value_32)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint16x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + low16_to_s32(in_range_a), low16_to_s32(in_range_b), + low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a, b, + c, d); + + // Doubled fractions 001122..., low part + uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); + uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); + uint16x8_t nxfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + uint16x8_t nyfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + uint16x8_t res0 = interpolate( + vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), + vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), + vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); + uint16x8_t res1 = interpolate( + vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), + vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), + vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); + res.val[0] = + vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + 
vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), + hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a, b, c, + d); + // Doubled fractions 001122..., high part + xfrac2 = vzip2q(xfrac, xfrac); + yfrac2 = vzip2q(yfrac, yfrac); + nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), + vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), + vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), + vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); + res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), + vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + res.val[1] = + vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); + + store_pixels_u8_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint8x16_t v_border_; +}; // end of class RemapS16Point5Constant4ch + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : 
src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{} { + uint64_t border_value_64{}; + memcpy(&border_value_64, border_value, sizeof(uint64_t)); + v_border_ = vreinterpretq_u16_u64(vdupq_n_u64(border_value_64)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint16x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; + uint16x8x4_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + low16_to_s32(in_range_a), low16_to_s32(in_range_b), + low16_to_s32(in_range_c), low16_to_s32(in_range_d), v_border_, a_low, + a_high, b_low, b_high, c_low, c_high, d_low, d_high); + + // Doubled fractions 001122..., low part + uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); + uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); + uint16x8_t nxfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + uint16x8_t nyfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res.val[0] = + interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), + vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), + vzip1q(nyfrac2, nyfrac2)); + res.val[1] = + interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), 
vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + hi16_to_s32(in_range_a), hi16_to_s32(in_range_b), + hi16_to_s32(in_range_c), hi16_to_s32(in_range_d), v_border_, a_low, + a_high, b_low, b_high, c_low, c_high, d_low, d_high); + + // Doubled fractions 001122..., high part + xfrac2 = vzip2q(xfrac, xfrac); + yfrac2 = vzip2q(yfrac, yfrac); + nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res.val[2] = + interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), + vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), + vzip1q(nyfrac2, nyfrac2)); + res.val[3] = + interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + + store_pixels_u16_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint16x8_t v_border_; +}; // end of class RemapS16Point5Constant4ch + // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -891,7 +1241,9 @@ kleidicv_error_t remap_s16point5( zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } else { assert(channels == 4); - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + RemapS16Point5Constant4ch operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); diff --git a/kleidicv/src/transform/remap_s16point5_sve2.cpp b/kleidicv/src/transform/remap_s16point5_sve2.cpp index 982717173f68ac9b10907c3ae17e0aba6f7e2d8c..d9f94012fa53b9495d6da74c148d76ed9cfcbfa8 100644 --- a/kleidicv/src/transform/remap_s16point5_sve2.cpp +++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp @@ -10,6 +10,7 @@ #include #include "kleidicv/sve2.h" +#include "kleidicv/traits.h" #include "kleidicv/transform/remap.h" #include "transform_sve2.h" @@ -870,6 +871,349 @@ class RemapS16Point5Replicate4ch { svint32_t& v_ymax_; }; // end of class RemapS16Point5Replicate4ch +template +class RemapS16Point5Constant4ch; + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint8_t; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + svuint16_t& v_src_stride, svuint16_t& v_x_max, + svuint16_t& v_y_max, svuint32_t& v_border) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max}, + v_border_{v_border} { + v_src_stride_ = svdup_u16(src_rows.stride()); + v_xmax_ = svdup_u16(static_cast(src_width - 1)); + v_ymax_ = svdup_u16(static_cast(src_height - 1)); + uint32_t border_value_u32{}; + memcpy(&border_value_u32, border_value, sizeof(uint32_t)); + v_border_ = svdup_u32(border_value_u32); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, svcnth()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = 
svptrue_b16(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = svwhilelt_b16(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + svuint16x2_t xy = + svld2_u16(pg, reinterpret_cast(&mapxy[0])); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Negative values become big positive ones + svuint16_t x0 = svget2(xy, 0); + svuint16_t y0 = svget2(xy, 1); + svuint16_t x1 = svadd_n_u16_x(pg, x0, 1); + svuint16_t y1 = svadd_n_u16_x(pg, y0, 1); + + // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 + // channels + auto load_4ch_or_border_b = [&](svuint16_t x, svuint16_t y) { + svbool_t in_range_b16 = + svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); + svbool_t in_range = svtrn1_b16(in_range_b16, svpfalse()); + svuint32_t image = svld1_gather_u32offset_u32( + in_range, reinterpret_cast(&src_rows_[0]), + svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_)); + return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); + }; + auto load_4ch_or_border_t = [&](svuint16_t x, svuint16_t y) { + svbool_t in_range_b16 = + svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); + svbool_t in_range = svtrn2_b16(in_range_b16, svpfalse()); + svuint32_t image = svld1_gather_u32offset_u32( + in_range, reinterpret_cast(&src_rows_[0]), + svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_)); + return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); + }; + + svuint16_t frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, 
+ svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t line0 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); + svuint16_t line1 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); + svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); + acc_b = svmlalb_u32(acc_b, line1, yfrac); + acc_t = svmlalt_u32(acc_t, line1, yfrac); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // bottom part + svuint8_t a = load_4ch_or_border_b(x0, y0); + svuint8_t b = load_4ch_or_border_b(x1, y0); + svuint8_t c = load_4ch_or_border_b(x0, y1); + svuint8_t d = load_4ch_or_border_b(x1, y1); + // from xfrac, we need the bottom part twice + svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); + svuint16_t nxfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); + svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); + svuint16_t nyfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_bb = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... 
+ svuint16_t res_bt = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_b = + svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); + + // top part + a = load_4ch_or_border_t(x0, y0); + b = load_4ch_or_border_t(x1, y0); + c = load_4ch_or_border_t(x0, y1); + d = load_4ch_or_border_t(x1, y1); + // from xfrac, we need the top part twice + svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); + svuint16_t nxfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); + svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); + svuint16_t nyfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_tb = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_tt = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_t = + svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); + + svbool_t pg_low = svwhilelt_b32(0L, step); + svbool_t pg_high = svwhilelt_b32(svcntw(), static_cast(step)); + svuint32_t res_low = + svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + svuint32_t res_high = + svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + mapxy += step; + svst1_u32(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u32(pg_high, reinterpret_cast(&dst[0]) + svcntw(), + res_high); + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_stride_; + svuint16_t& v_xmax_; + svuint16_t& v_ymax_; + svuint32_t& v_border_; +}; // end of class RemapS16Point5Constant4ch + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint16_t; + + 
RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + svuint32_t& v_src_stride, svuint32_t& v_x_max, + svuint32_t& v_y_max, svuint64_t& v_border) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max}, + v_border_{v_border} { + v_src_stride_ = svdup_u32(src_rows.stride()); + v_xmax_ = svdup_u32(static_cast(src_width - 1)); + v_ymax_ = svdup_u32(static_cast(src_height - 1)); + uint64_t border_value_u64{}; + memcpy(&border_value_u64, border_value, sizeof(uint64_t)); + v_border_ = svdup_u64(border_value_u64); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, svcntw()}; + loop.unroll_once([&](size_t step) { + vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), mapxy, mapfrac, + dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = svwhilelt_b32(step, step + length); + svbool_t pg_low = svzip1_b32(pg, svpfalse()); + svbool_t pg_high = svzip2_b32(pg, svpfalse()); + vector_path(pg, pg_low, pg_high, mapxy, mapfrac, dst, + static_cast(length)); + }); + } + + void vector_path(svbool_t pg, svbool_t pg_low, svbool_t pg_high, + Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + // Load one vector of xy: even coordinates are x, odd are y + svint16_t xy = svreinterpret_s16_u32( + svld1_u32(pg, reinterpret_cast(&mapxy[0]))); + + // Negative values become big positive ones + // Widening is signed, so 16-bit -1 becomes 32-bit -1 + svuint32_t x0 = svreinterpret_u32_s32(svmovlb(xy)); + svuint32_t y0 = svreinterpret_u32_s32(svmovlt(xy)); + svuint32_t x1 = svadd_n_u32_x(pg, x0, 1); + svuint32_t y1 = svadd_n_u32_x(pg, y0, 1); + + auto load_4ch_or_border_b = [&](svuint32_t x, svuint32_t y) { + svbool_t in_range_b32 = + svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); + svbool_t in_range = svtrn1_b32(in_range_b32, svpfalse()); + svuint64_t image 
= svld1_gather_u64offset_u64( + in_range, reinterpret_cast(&src_rows_[0]), + svmlalb_u64(svshllb_n_u64(x, 3), y, v_src_stride_)); + return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); + }; + + auto load_4ch_or_border_t = [&](svuint32_t x, svuint32_t y) { + svbool_t in_range_b32 = + svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); + svbool_t in_range = svtrn2_b32(in_range_b32, svpfalse()); + svuint64_t image = svld1_gather_u64offset_u64( + in_range, reinterpret_cast(&src_rows_[0]), + svmlalt_u64(svshllt_n_u64(x, 3), y, v_src_stride_)); + return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); + }; + + svuint16_t xfrac, yfrac, nxfrac, nyfrac; + { + // Fractions are loaded into even lanes + svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); + + // Fractions are doubled, 00112233... (will be doubled again later) + svuint16_t frac = svtrn1(rawfrac, rawfrac); + + xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + } + + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); + svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); + svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); + svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = + svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); + svuint32_t acc_t = + svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); + acc_b = 
svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); + acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) + // Calculation is done in 2 parts, top and bottom + svuint16_t res_b, res_t; + + { // bottom + svuint16_t a = load_4ch_or_border_b(x0, y0); + svuint16_t b = load_4ch_or_border_b(x1, y0); + svuint16_t c = load_4ch_or_border_b(x0, y1); + svuint16_t d = load_4ch_or_border_b(x1, y1); + + // Copy even lanes twice -> 000022224444... these are the "bottom" + // fractions + svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); + svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); + svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); + svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); + + res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // top + svuint16_t a = load_4ch_or_border_t(x0, y0); + svuint16_t b = load_4ch_or_border_t(x1, y0); + svuint16_t c = load_4ch_or_border_t(x0, y1); + svuint16_t d = load_4ch_or_border_t(x1, y1); + + // Copy odd lanes twice -> 111133335555... 
these are the "top" + // fractions + svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); + svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); + svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); + svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); + + res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + svuint64_t res_low = + svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); + svuint64_t res_high = + svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); + svst1_u64(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u64(pg_high, reinterpret_cast(&dst[0]) + svcntd(), + res_high); + mapxy += step; + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint32_t& v_src_stride_; + svuint32_t& v_xmax_; + svuint32_t& v_ymax_; + svuint64_t& v_border_; +}; // end of class RemapS16Point5Constant4ch + // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -904,15 +1248,23 @@ kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, Rectangle rect{dst_width, dst_height}; if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - svuint16_t sv_width, sv_height, sv_border; if (channels == 1) { + svuint16_t sv_width, sv_height, sv_border; RemapS16Point5ConstantBorder operation{ src_rows, src_width, src_height, border_value, sv_src_stride, sv_width, sv_height, sv_border}; zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } else { assert(channels == 4); - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + typedef typename double_element_width::type DoubleType; + typedef typename double_element_width::type QuadType; + typename VecTraits::VectorType sv_width, sv_height, + sv_src_stride; + typename VecTraits::VectorType sv_border; + RemapS16Point5Constant4ch operation{ + src_rows, src_width, src_height, border_value, + sv_src_stride, sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index c293ef1cb3ae49ef949d3dcee321ca54ec54cdcf..158505944652653397d7bf1701771ccf717a4632 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -85,9 +85,11 @@ Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 1 Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U8_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' +Remap_S16Point5_U8_Constant_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC4, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' 
Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' +Remap_S16Point5_U16_Constant_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC4, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' Remap_F32_U16_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index e85b611bc23e77f395021e18c93c05afbfd19d26..435a5ae486ce88d8d8dcfc93ec9f307ce5633afd 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -35,7 +35,8 @@ KLEIDICV_REMAP_F32(uint16_t, u16); template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, - kleidicv_border_type_t border_type, const ScalarType *border_value) { + ptrdiff_t ch, kleidicv_border_type_t border_type, + const ScalarType *border_value) { // Width is the number of pixels in a row, but Array2D does not handle that const ptrdiff_t src_width = static_cast(src.width() / src.channels()); @@ -47,10 +48,10 @@ static const ScalarType *get_array2d_element_or_border( } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); if (x >= src_width || y >= src_height || x < 0 || y < 0) { - return border_value; + return border_value + ch; } } - return src.at(y, x * src.channels()); + return src.at(y, x * src.channels() + ch); } template @@ -191,9 +192,9 @@ class RemapS16 : public testing::Test { 
kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -202,7 +203,7 @@ class RemapS16 : public testing::Test { for (size_t ch = 0; ch < src.channels(); ++ch) { const int16_t *coords = mapxy.at(row, column * 2); int16_t x = coords[0], y = coords[1]; - *expected.at(row, column * src.channels() + ch) = get_src(x, y)[ch]; + *expected.at(row, column * src.channels() + ch) = *get_src(x, y, ch); } } } @@ -599,8 +600,12 @@ class RemapS16Point5 : public testing::Test { test::Array2D actual{dst_total_width, dst_h, padding, channels}; test::Array2D expected{dst_total_width, dst_h, padding, channels}; - test::PseudoRandomNumberGenerator generator; - source.fill(generator); + ScalarType counter = 0; + for (size_t y = 0; y < src_h; ++y) { + for (size_t x = 0; x < src_total_width; ++x) { + *source.at(y, x) = ++counter; + } + } actual.fill(42); calculate_expected(source, mapxy, mapfrac, border_type, border_value, @@ -647,9 +652,9 @@ class RemapS16Point5 : public testing::Test { kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -666,9 +671,9 @@ class RemapS16Point5 : public testing::Test { const int16_t *coords = mapxy.at(row, column * 2); ptrdiff_t x = coords[0], y = coords[1]; for (size_t ch = 0; ch < src.channels(); 
++ch) { - *expected.at(row, column * src.channels() + ch) = - lerp_2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], - get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); + *expected.at(row, column * src.channels() + ch) = lerp_2d( + x_frac, y_frac, *get_src(x, y, ch), *get_src(x + 1, y, ch), + *get_src(x, y + 1, ch), *get_src(x + 1, y + 1, ch)); } } } @@ -693,14 +698,15 @@ TYPED_TEST(RemapS16Point5, RandomNoPadding) { } } -// TODO: Modify tests to also run constant border once implemented -TYPED_TEST(RemapS16Point5, RandomNoPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, RandomNoPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = 0; - TestFixture::test_random(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_random(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, BlendPadding) { @@ -711,13 +717,15 @@ TYPED_TEST(RemapS16Point5, BlendPadding) { } } -TYPED_TEST(RemapS16Point5, BlendPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, BlendPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = 7; - TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { @@ -729,13 +737,15 @@ TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { } } -TYPED_TEST(RemapS16Point5, OutsideRandomPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, OutsideRandomPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = 11; - TestFixture::test_outside_random( - w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, padding); + for (auto 
[border_type, border_value] : get_borders()) { + TestFixture::test_outside_random(w, h, w, h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16Point5, BlendBigStride) { @@ -751,14 +761,16 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) { } } -TYPED_TEST(RemapS16Point5, BlendBigStride4chReplicate) { +TYPED_TEST(RemapS16Point5, BlendBigStride4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = std::numeric_limits::max() / channels - w * channels; - TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, CornerCases) { @@ -781,9 +793,10 @@ TYPED_TEST(RemapS16Point5, CornerCases4ch) { size_t dst_h = defaultHeight(); size_t channels = 4; size_t padding = 17; - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, - padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); + } } TYPED_TEST(RemapS16Point5, NullPointer) { @@ -922,18 +935,6 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src)); } -TYPED_TEST(RemapS16Point5, UnsupportedConstantBorder4ch) { - const TypeParam src[1] = {}; - TypeParam dst[8]; - int16_t mapxy[16] = {}; - uint16_t mapfrac[8] = {}; - - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_s16point5()(src, sizeof(TypeParam), 1, 1, dst, 8, - 8, 1, 4, mapxy, 4, mapfrac, 2, - KLEIDICV_BORDER_TYPE_CONSTANT, src)); -} - TYPED_TEST(RemapS16Point5, UnsupportedBigStride4ch) { const TypeParam src[1] = {}; TypeParam dst[8]; @@ -1185,9 +1186,9 @@ class RemapF32 : public testing::Test { const ScalarType *border_value, 
kleidicv_interpolation_type_t interpolation, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -1208,10 +1209,10 @@ class RemapF32 : public testing::Test { float xfrac = x - std::floor(x); float yfrac = y - std::floor(y); for (size_t ch = 0; ch < src.channels(); ++ch) { - float a = get_src(ix, iy)[ch]; - float b = get_src(ix + 1, iy)[ch]; - float c = get_src(ix, iy + 1)[ch]; - float d = get_src(ix + 1, iy + 1)[ch]; + float a = *get_src(ix, iy, ch); + float b = *get_src(ix + 1, iy, ch); + float c = *get_src(ix, iy + 1, ch); + float d = *get_src(ix + 1, iy + 1, ch); float line1 = (b - a) * xfrac + a; float line2 = (d - c) * xfrac + c; float float_result = (line2 - line1) * yfrac + line1; @@ -1230,7 +1231,7 @@ class RemapF32 : public testing::Test { static_cast(KLEIDICV_MAX_IMAGE_PIXELS)))); for (size_t ch = 0; ch < src.channels(); ++ch) { *expected.at(row, column * src.channels() + ch) = - get_src(ix, iy)[ch]; + *get_src(ix, iy, ch); } } } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index c16f6356b6bc7f04139833be2d39036ce1d54f35..9e6d18685dd7918edb2507fb32771cb12318bdf2 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -747,6 +747,13 @@ TEST_P(Thread, remap_s16point5_u8_border_constant) { KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } +TEST_P(Thread, remap_s16point5_u8_border_constant_4ch) { + const uint8_t border_value[] = {1, 2, 3, 4}; + check_remap_s16point5(kleidicv_remap_s16point5_u8, + kleidicv_thread_remap_s16point5_u8, 4, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); +} + TEST_P(Thread, remap_s16point5_u16_border_constant) { const uint16_t border_value = 0; 
check_remap_s16point5(kleidicv_remap_s16point5_u16, @@ -754,6 +761,13 @@ TEST_P(Thread, remap_s16point5_u16_border_constant) { KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } +TEST_P(Thread, remap_s16point5_u16_border_constant_4ch) { + const uint16_t border_value[] = {1, 2, 3, 4}; + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 4, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); +} + TEST_P(Thread, remap_s16point5_u8_not_implemented) { const uint8_t border_value = 0; check_remap_s16point5_not_implemented(