From 5f2b75b58b581bb74aad04c6cb12287bc215d500 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Mon, 27 Jan 2025 17:17:56 +0000 Subject: [PATCH] Add u16 data format to Remap linear interpolation --- CHANGELOG.md | 2 +- adapters/opencv/kleidicv_hal.cpp | 17 +- benchmark/benchmark.cpp | 16 + conformity/opencv/test_remap.cpp | 2 + doc/functionality.md | 2 +- doc/opencv.md | 3 +- kleidicv/include/kleidicv/kleidicv.h | 10 + kleidicv/include/kleidicv/transform/remap.h | 6 +- kleidicv/src/transform/remap_api.cpp | 5 + kleidicv/src/transform/remap_neon.cpp | 346 ++++++++++++++++++ kleidicv/src/transform/remap_sc.h | 261 +++++++++++++ kleidicv/src/transform/remap_sve2.cpp | 9 +- .../include/kleidicv_thread/kleidicv_thread.h | 12 + kleidicv_thread/src/kleidicv_thread.cpp | 23 ++ scripts/benchmark/benchmarks.txt | 2 + test/api/test_remap.cpp | 172 ++++++--- test/api/test_thread.cpp | 81 +++- 17 files changed, 884 insertions(+), 85 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14816afce..36e85ca33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ This changelog aims to follow the guiding principles of - Implementation of Rotate 90 degrees clockwise. - Remap implementations - Nearest neighbour, for replicated borders with 1-channel u8 and u16 inputs. - - Fixed-point interpolation, for replicated borders with 1-channel u8 input. + - Fixed-point interpolation, for replicated borders with 1-channel u8 and u16 inputs. - WarpPerspective implementation - Nearest and Linear interpolation method, for 1-channel u8 input. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 2fa3c244d..addbd7281 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1352,14 +1352,23 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, return CV_HAL_ERROR_NOT_IMPLEMENTED; } - auto border_value = get_border_value(border_value_f64); - auto mt = get_multithreading(); if (src_type == CV_8UC1) { + auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_s16point5_u8( - src_data, src_step, static_cast(src_width), - static_cast(src_height), dst_data, dst_step, + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 1, + mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, + border_value.data(), mt)); + } else if (src_type == CV_16UC1) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_s16point5_u16( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, static_cast(dst_width), static_cast(dst_height), 1, mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, border_value.data(), mt)); diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 2bab77141..9d8e980e4 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -690,6 +690,22 @@ BENCH_REMAP_S16POINT5(remap_s16point5_u8_identity, remap_s16point5_u8, get_identity_mapxy, 1, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); +BENCH_REMAP_S16POINT5(remap_s16point5_u16_random, remap_s16point5_u16, + get_random_mapxy, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + +BENCH_REMAP_S16POINT5(remap_s16point5_u16_blend, remap_s16point5_u16, + get_blend_mapxy, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + +BENCH_REMAP_S16POINT5(remap_s16point5_u16_flip, remap_s16point5_u16, + get_flip_mapxy, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + +BENCH_REMAP_S16POINT5(remap_s16point5_u16_identity, remap_s16point5_u16, + get_identity_mapxy, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, uint16_t); + // clang-format off static const float transform_identity[] = { 1.0, 0, 0, diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index c60789cc6..6034acc28 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -137,10 +137,12 @@ std::vector& remap_tests_get() { TEST("RemapS16 uint8 Replicate", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Replicate", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16 uint8 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 9cc895415..2f2e2eed5 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -96,7 +96,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | | u8 | u16 | |--------------------------------------------|-----|-----| | Remap int16 coordinates | x | x | -| Remap int16+uint16 fixed-point coordinates | x | | +| Remap int16+uint16 fixed-point coordinates | x | x | # WarpPerspective | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index 5bef02c0a..b50b5bc5f 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -212,8 +212,7 @@ Notes on parameters: * `src.step` - must be less than 2^16 * `element size` * `src.width`, `src_height` - must not be bigger than 2^15 * `src.depth()` - * Supports `CV_8U` and `CV_16U` depths and 1 channel when prodiving only an integer map (`map1`). - * Supports `CV_8U` depth and 1 channel when providing integer + fractional maps (`map1` and `map2`). + * Supports `CV_8U` and `CV_16U` depths and 1 channel. * `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. \ Supported map configurations: * `map1` is 16SC2: channel #1 is x coordinate (column) and channel #2 is y (row) diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 75e7b3511..9c6cdfa1d 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1822,6 +1822,16 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u8, const uint8_t *src, const uint16_t *mapfrac, size_t mapfrac_stride, kleidicv_border_type_t border_type, const uint8_t *border_value); + +/// @copydoc kleidicv_remap_s16point5_u8 +KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, + size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, + size_t dst_height, size_t channels, + const int16_t *mapxy, size_t mapxy_stride, + const uint16_t *mapfrac, size_t mapfrac_stride, + kleidicv_border_type_t border_type, + const uint16_t *border_value); #endif // DOXYGEN #ifndef DOXYGEN diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index 759470129..78dd1e8c5 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -23,7 +23,6 @@ inline bool remap_s16_is_implemented( dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && - dst_width >= 8 && (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && channels == 1); @@ -37,8 +36,9 @@ inline bool remap_s16point5_is_implemented( size_t src_stride, size_t src_width, size_t src_height, size_t dst_width, kleidicv_border_type_t border_type, size_t channels) KLEIDICV_STREAMING_COMPATIBLE { - if constexpr (std::is_same::value) { - return (src_stride <= std::numeric_limits::max() && + if constexpr (std::is_same::value || + std::is_same::value) { + return (src_stride / sizeof(T) <= std::numeric_limits::max() && dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && diff --git a/kleidicv/src/transform/remap_api.cpp b/kleidicv/src/transform/remap_api.cpp index 3abc4071f..bda9c17f6 100644 --- a/kleidicv/src/transform/remap_api.cpp +++ b/kleidicv/src/transform/remap_api.cpp @@ -17,3 +17,8 @@ KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16_u16, KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u8, &kleidicv::neon::remap_s16point5, &kleidicv::sve2::remap_s16point5, nullptr); + +KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u16, + &kleidicv::neon::remap_s16point5, + &kleidicv::sve2::remap_s16point5, + nullptr); diff --git a/kleidicv/src/transform/remap_neon.cpp b/kleidicv/src/transform/remap_neon.cpp index d29200e86..a540daf56 100644 --- a/kleidicv/src/transform/remap_neon.cpp +++ b/kleidicv/src/transform/remap_neon.cpp @@ -357,6 +357,178 @@ class RemapS16Point5Replicate { int16x8_t v_ymax_; }; // end of class RemapS16Point5Replicate +template <> +class RemapS16Point5Replicate { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Replicate(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdupq_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))}, + xfrac_{vdupq_n_u16(0)}, + yfrac_{vdupq_n_u16(0)}, + nxfrac_{vdupq_n_u16(0)}, + nyfrac_{vdupq_n_u16(0)}, + x0_{vdupq_n_s16(0)}, + x1_{vdupq_n_s16(0)}, + y0_{vdupq_n_s16(0)}, + y1_{vdupq_n_s16(0)} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + prepare_maps(mapxy, mapfrac); + transform_pixels(dst); + + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + void prepare_maps(Columns mapxy, + Columns mapfrac) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); + xfrac_ = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(frac, frac_mask)); + yfrac_ = vbslq_u16( + vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask)); + nxfrac_ = vsubq_u16(frac_max, xfrac_); + nyfrac_ = vsubq_u16(frac_max, yfrac_); + + // Clamp coordinates to within the dimensions of the source image + x0_ = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); + y0_ = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); + + // x1 = x0 + 1, except if it's already xmax + x1_ = vsubq_u16(x0_, vcltq_s16(xy.val[0], v_xmax_)); + y1_ = vsubq_u16(y0_, vcltq_s16(xy.val[1], v_ymax_)); + } + + void transform_pixels(Columns dst) { + uint16x8_t a = load_pixels(x0_, y0_); + uint16x8_t b = load_pixels(x0_, y1_); + uint16x8_t c = load_pixels(x1_, y0_); + uint16x8_t d = load_pixels(x1_, y1_); + + uint16x8_t result = interpolate(a, b, c, d); + + vst1q_u16(&dst[0], result); + } + + uint16x8_t load_pixels(int16x8_t x, int16x8_t y) { + // Clamp coordinates to within the dimensions of the source image + uint16x8_t x_clamped = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(x, vdupq_n_s16(0))), v_xmax_); + uint16x8_t y_clamped = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(y, vdupq_n_s16(0))), v_ymax_); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + uint32x4_t indices_low = + vmlal_u16(vmovl_u16(vget_low_u16(x_clamped)), vget_low_u16(y_clamped), + vget_low_u16(v_src_element_stride_)); + uint32x4_t indices_high = vmlal_high_u16(vmovl_high_u16(x_clamped), + y_clamped, v_src_element_stride_); + + // Read pixels from source + uint16x8_t pixels = { + src_rows_[vgetq_lane_u32(indices_low, 0)], + src_rows_[vgetq_lane_u32(indices_low, 1)], + src_rows_[vgetq_lane_u32(indices_low, 2)], + src_rows_[vgetq_lane_u32(indices_low, 3)], + src_rows_[vgetq_lane_u32(indices_high, 0)], + src_rows_[vgetq_lane_u32(indices_high, 1)], + src_rows_[vgetq_lane_u32(indices_high, 2)], + src_rows_[vgetq_lane_u32(indices_high, 3)], + }; + + return pixels; + } + + uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, + uint16x8_t d) { + auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return vmlal_u16(vmull_u16(nfrac, left), frac, right); + }; + + auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), + vget_low_u16(frac), vget_low_u16(nfrac)); + }; + + auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), + vget_high_u16(frac), vget_high_u16(nfrac)); + }; + + // Offset pixel values by 0.5 before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, + uint32x4_t nfrac) -> uint32x4_t { + uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); + return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_); + uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_); + uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_); + uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_); + + uint32x4_t lo = interpolate_vertical(line0_low, line1_low, + vmovl_u16(vget_low_u16(yfrac_)), + vmovl_u16(vget_low_u16(nyfrac_))); + uint32x4_t hi = + interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_), + vmovl_high_u16(nyfrac_)); + + // Discard upper 16 bits of each element (low the precision back to original + // 16 bits) + uint16x8_t result = + vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); + return result; + } + + private: + Rows src_rows_; + uint16x8_t v_src_element_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; + uint16x8_t xfrac_; + uint16x8_t yfrac_; + uint16x8_t nxfrac_; + uint16x8_t nyfrac_; + int16x8_t x0_; + int16x8_t x1_; + int16x8_t y0_; + int16x8_t y1_; +}; // end of class RemapS16Point5Replicate + template class RemapS16Point5ConstantBorder; @@ -499,6 +671,179 @@ class RemapS16Point5ConstantBorder { uint8x8_t v_border_; }; // end of class RemapS16Point5ConstantBorder +template <> +class RemapS16Point5ConstantBorder { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5ConstantBorder(Rows src_rows, + size_t src_width, size_t src_height, + const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_element_stride_{vdupq_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u16(*border_value)}, + xfrac_{vdupq_n_u16(0)}, + yfrac_{vdupq_n_u16(0)}, + nxfrac_{vdupq_n_u16(0)}, + nyfrac_{vdupq_n_u16(0)}, + x0_{vdupq_n_s16(0)}, + x1_{vdupq_n_s16(0)}, + y0_{vdupq_n_s16(0)}, + y1_{vdupq_n_s16(0)} {} + + void prepare_maps(Columns mapxy, + Columns mapfrac) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); + xfrac_ = vandq_u16(frac, frac_mask); + yfrac_ = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); + nxfrac_ = vsubq_u16(frac_max, xfrac_); + nyfrac_ = vsubq_u16(frac_max, yfrac_); + + uint16x8_t one = vdupq_n_u16(1); + x0_ = xy.val[0]; + y0_ = xy.val[1]; + x1_ = vaddq_u16(x0_, one); + y1_ = vaddq_u16(y0_, one); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + prepare_maps(mapxy, mapfrac); + transform_pixels(dst); + + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + void transform_pixels(Columns dst) { + uint16x8_t a = load_pixels(x0_, y0_); + uint16x8_t b = load_pixels(x0_, y1_); + uint16x8_t c = load_pixels(x1_, y0_); + uint16x8_t d = load_pixels(x1_, y1_); + + uint16x8_t result = interpolate(a, b, c, d); + + vst1q_u16(&dst[0], result); + } + + uint16x8_t load_pixels(uint16x8_t x, uint16x8_t y) { + // Find whether coordinates are within the image dimensions. + // Negative coordinates are interpreted as large values due to the s16->u16 + // reinterpretation. + uint16x8_t in_range = + vandq_u16(vcltq_u16(vreinterpretq_u16_s16(x), v_width_), + vcltq_u16(vreinterpretq_u16_s16(y), v_height_)); + + // Zero out-of-range coordinates. + x = vandq_u16(in_range, x); + y = vandq_u16(in_range, y); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + uint32x4_t indices_low = + vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), + vget_low_u16(v_src_element_stride_)); + uint32x4_t indices_high = + vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); + + // Read pixels from source + uint16x8_t pixels = { + src_rows_[vgetq_lane_u32(indices_low, 0)], + src_rows_[vgetq_lane_u32(indices_low, 1)], + src_rows_[vgetq_lane_u32(indices_low, 2)], + src_rows_[vgetq_lane_u32(indices_low, 3)], + src_rows_[vgetq_lane_u32(indices_high, 0)], + src_rows_[vgetq_lane_u32(indices_high, 1)], + src_rows_[vgetq_lane_u32(indices_high, 2)], + src_rows_[vgetq_lane_u32(indices_high, 3)], + }; + // Select between source pixels and border colour + return vbslq_u16(in_range, pixels, v_border_); + } + + uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, + uint16x8_t d) { + auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return vmlal_u16(vmull_u16(nfrac, left), frac, right); + }; + + auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), + vget_low_u16(frac), vget_low_u16(nfrac)); + }; + + auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), + vget_high_u16(frac), vget_high_u16(nfrac)); + }; + + // Offset pixel values by 0.5 before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, + uint32x4_t nfrac) -> uint32x4_t { + uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); + return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_); + uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_); + uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_); + uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_); + + uint32x4_t lo = interpolate_vertical(line0_low, line1_low, + vmovl_u16(vget_low_u16(yfrac_)), + vmovl_u16(vget_low_u16(nyfrac_))); + uint32x4_t hi = + interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_), + vmovl_high_u16(nyfrac_)); + + // Discard upper 16 bits of each element (low the precision back to original + // 16 bits) + uint16x8_t result = + vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); + return result; + } + + private: + Rows src_rows_; + uint16x8_t v_src_element_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint16x8_t v_border_; + uint16x8_t xfrac_; + uint16x8_t yfrac_; + uint16x8_t nxfrac_; + uint16x8_t nyfrac_; + int16x8_t x0_; + int16x8_t x1_; + int16x8_t y0_; + int16x8_t y1_; +}; // end of class RemapS16Point5ConstantBorder + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -561,5 +906,6 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); } // namespace kleidicv::neon diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index b6311ac7f..65c5afe11 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -300,6 +300,38 @@ inline svuint16_t interpolate_16point5( 2ULL * REMAP16POINT5_FRAC_BITS); } +template <> +inline svuint16_t interpolate_16point5( + svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t nxfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + svuint16_t nyfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + svuint32_t line0_b = svmla_x(pg, svmullb(xfrac, src_b), svmovlb_u32(nxfrac), + svmovlb_u32(src_a)); + svuint32_t line0_t = svmla_x(pg, svmullt(xfrac, src_b), svmovlt_u32(nxfrac), + svmovlt_u32(src_a)); + svuint32_t line1_b = svmla_x(pg, svmullb(xfrac, src_d), svmovlb_u32(nxfrac), + svmovlb_u32(src_c)); + svuint32_t line1_t = svmla_x(pg, svmullt(xfrac, src_d), svmovlt_u32(nxfrac), + svmovlt_u32(src_c)); + + svuint32_t acc_b = + svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_b, svmovlb_u32(nyfrac)), + line1_b, svmovlb_u32(yfrac)); + svuint32_t acc_t = + svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_t, svmovlt_u32(nyfrac)), + line1_t, svmovlt_u32(yfrac)); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); +} + template class RemapS16Point5Replicate; @@ -419,6 +451,134 @@ class RemapS16Point5Replicate { MapVectorType& v_ymax_; }; // end of class RemapS16Point5Replicate +template <> +class RemapS16Point5Replicate { + public: + using ScalarType = uint16_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5Replicate(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + svuint16_t src_a, src_b, src_c, src_d; + + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + auto vector_path = [&](svbool_t pg, ptrdiff_t step) { + load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); + interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d, + bias); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, static_cast(length)); + }); + } + + protected: + svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t, + svuint32_t offsets_t) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); + + svuint32_t src_b = + svldnt1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t src_t = + svldnt1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + return svtrn1_u16(svreinterpret_u16_u32(src_b), + svreinterpret_u16_u32(src_t)); + } + + void load_source(svbool_t pg, ptrdiff_t step, Columns& mapxy, + svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c, + svuint16_t& src_d) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + svuint32_t offsets_a_b = + svmlalb_u32(svmovlb_u32(x0), y0, v_src_element_stride_); + svuint32_t offsets_a_t = + svmlalt_u32(svmovlt_u32(x0), y0, v_src_element_stride_); + svuint32_t offsets_b_b = + svmlalb_u32(svmovlb_u32(x1), y0, v_src_element_stride_); + svuint32_t offsets_b_t = + svmlalt_u32(svmovlt_u32(x1), y0, v_src_element_stride_); + svuint32_t offsets_c_b = + svmlalb_u32(svmovlb_u32(x0), y1, v_src_element_stride_); + svuint32_t offsets_c_t = + svmlalt_u32(svmovlt_u32(x0), y1, v_src_element_stride_); + svuint32_t offsets_d_b = + svmlalb_u32(svmovlb_u32(x1), y1, v_src_element_stride_); + svuint32_t offsets_d_t = + svmlalt_u32(svmovlt_u32(x1), y1, v_src_element_stride_); + + // Load pixels from source + src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t); + src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t); + src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t); + src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t); + mapxy += step; + } + + void interpolate_and_store(svbool_t pg, ptrdiff_t step, + Columns& mapfrac, + Columns& dst, svuint16_t src_a, + svuint16_t src_b, svuint16_t src_c, + svuint16_t src_d, svuint32_t bias) { + FracVectorType frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t result = interpolate_16point5(pg, frac, src_a, src_b, + src_c, src_d, bias); + svst1_u16(pg, &dst[0], result); + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_element_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5Replicate + template class RemapS16Point5ConstantBorder; @@ -513,6 +673,107 @@ class RemapS16Point5ConstantBorder { svuint16_t& v_border_; }; // end of class RemapS16Point5ConstantBorder +template <> +class RemapS16Point5ConstantBorder { + public: + using ScalarType = uint16_t; + + RemapS16Point5ConstantBorder(Rows src_rows, + size_t src_width, size_t src_height, + const ScalarType* border_value, + svuint16_t& v_src_stride, svuint16_t& v_width, + svuint16_t& v_height, svuint16_t& v_border) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_stride}, + v_width_{v_width}, + v_height_{v_height}, + v_border_{v_border} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_width_ = svdup_u16(static_cast(src_width)); + v_height_ = svdup_u16(static_cast(src_height)); + v_border_ = svdup_u16(*border_value); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + svuint16_t one = svdup_n_u16(1); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + for (size_t i = 0; i < width; i += svcnth()) { + svbool_t pg = svwhilelt_b16(i, width); + + svuint16x2_t xy = + svld2_u16(pg, reinterpret_cast( + &mapxy[static_cast(i * 2)])); + + svuint16_t x0 = svget2(xy, 0); + svuint16_t y0 = svget2(xy, 1); + svuint16_t x1 = svadd_x(pg, x0, one); + svuint16_t y1 = svadd_x(pg, y0, one); + + svuint16_t v00 = load_pixels_or_constant_border( + src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, + x0, y0); + svuint16_t v01 = load_pixels_or_constant_border( + src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, + x0, y1); + svuint16_t v10 = load_pixels_or_constant_border( + src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, + x1, y0); + svuint16_t v11 = load_pixels_or_constant_border( + src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, + x1, y1); + + svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast(i)]); + svuint16_t result = + interpolate_16point5(pg, frac, v00, v10, v01, v11, bias); + + svst1_u16(pg, &dst[static_cast(i)], result); + } + } + + private: + svuint16_t load_pixels_or_constant_border(Rows src_rows_, + svuint16_t& v_src_element_stride_, + svuint16_t& v_width_, + svuint16_t& v_height_, + svuint16_t& v_border_, svbool_t pg, + svuint16_t x, svuint16_t y) { + // Find whether coordinates are within the image dimensions. + svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), + svcmplt_u16(pg, y, v_height_)); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + svuint32_t offsets_b = + svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + svuint32_t offsets_t = + svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); + + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + return svsel(in_range, result, v_border_); + } + + Rows src_rows_; + svuint16_t& v_src_element_stride_; + svuint16_t& v_width_; + svuint16_t& v_height_; + svuint16_t& v_border_; +}; // end of class RemapS16Point5ConstantBorder + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp index ff5b80583..0777091d7 100644 --- a/kleidicv/src/transform/remap_sve2.cpp +++ b/kleidicv/src/transform/remap_sve2.cpp @@ -27,10 +27,10 @@ kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, const uint16_t *mapfrac, size_t mapfrac_stride, kleidicv_border_type_t border_type, const T *border_value) { - return remap_s16point5_sc(src, src_stride, src_width, src_height, - dst, dst_stride, dst_width, dst_height, - channels, mapxy, mapxy_stride, mapfrac, - mapfrac_stride, border_type, border_value); + return remap_s16point5_sc(src, src_stride, src_width, src_height, dst, + dst_stride, dst_width, dst_height, channels, + mapxy, mapxy_stride, mapfrac, mapfrac_stride, + border_type, border_value); } #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ @@ -52,5 +52,6 @@ KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); kleidicv_border_type_t border_type, const type *border_value) KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); } // namespace kleidicv::sve2 diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 7ae5628f4..52eb762e6 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -416,6 +416,18 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8( kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_remap_s16point5_u16 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_remap_s16point5_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const int16_t *mapxy, size_t mapxy_stride, + const uint16_t *mapfrac, size_t mapfrac_stride, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_warp_perspective_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index b2f37c7d2..466994915 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -721,6 +721,29 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8( return parallel_batches(callback, mt, dst_height); } +kleidicv_error_t kleidicv_thread_remap_s16point5_u16( + const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + size_t channels, const int16_t *mapxy, size_t mapxy_stride, + const uint16_t *mapfrac, size_t mapfrac_stride, + kleidicv_border_type_t border_type, const uint16_t *border_value, + kleidicv_thread_multithreading mt) { + if (!kleidicv::remap_s16point5_is_implemented( + src_stride, src_width, src_height, dst_width, border_type, + channels)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_remap_s16point5_u16( + src, src_stride, src_width, src_height, + dst + begin * dst_stride / sizeof(uint16_t), dst_stride, dst_width, + end - begin, channels, mapxy + begin * mapxy_stride / sizeof(int16_t), + mapxy_stride, mapfrac + begin * mapfrac_stride / sizeof(uint16_t), + mapfrac_stride, border_type, border_value); + }; + return parallel_batches(callback, mt, dst_height); +} + kleidicv_error_t kleidicv_thread_warp_perspective_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 0047c7a7c..5e8ec8297 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -81,9 +81,11 @@ InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)' Remap_S16_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' +Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' +Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 1)' WarpPerspective_Nearest_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 1)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 6fe9ad298..8dd360127 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -16,6 +16,12 @@ KLEIDICV_REMAP_S16(uint8_t, u8); KLEIDICV_REMAP_S16(uint16_t, u16); +#define KLEIDICV_REMAP_S16POINT5(type, type_suffix) \ + KLEIDICV_API(remap_s16point5, kleidicv_remap_s16point5_##type_suffix, type) + +KLEIDICV_REMAP_S16POINT5(uint8_t, u8); +KLEIDICV_REMAP_S16POINT5(uint16_t, u16); + template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, @@ -299,24 +305,35 @@ TYPED_TEST(RemapS16, InvalidImageSize) { TypeParam dst[8]; int16_t mapxy[16] = {}; + // Source too wide EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_s16()(src, 1 * sizeof(TypeParam), std::numeric_limits::max() + 2, 1, dst, 8, 8, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source too high EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_s16()(src, 1 * sizeof(TypeParam), 1, std::numeric_limits::max() + 2, dst, 8, 8, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source extremely wide EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_s16()( src, (KLEIDICV_MAX_IMAGE_PIXELS + 1) * sizeof(TypeParam), KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source extremely high + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + remap_s16()(src, sizeof(TypeParam), 1, + KLEIDICV_MAX_IMAGE_PIXELS + 1, dst, + 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + + // Source too large EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_s16()( src, KLEIDICV_MAX_IMAGE_PIXELS * sizeof(TypeParam), @@ -324,6 +341,7 @@ TYPED_TEST(RemapS16, InvalidImageSize) { 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Destination too wide EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_s16()( src, 1 * sizeof(TypeParam), 1, 1, dst, @@ -331,6 +349,7 @@ TYPED_TEST(RemapS16, InvalidImageSize) { KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Destination too large EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_s16()( src, 1 * sizeof(TypeParam), 1, 1, dst, @@ -347,7 +366,7 @@ TYPED_TEST(RemapS16, UnsupportedBigStride) { EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_s16()( - src, (std::numeric_limits::max() + 1) * sizeof(TypeParam), + src, (std::numeric_limits::max() + 1L) * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); @@ -387,7 +406,7 @@ TYPED_TEST(RemapS16, UnsupportedTooSmallImage) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, remap_s16()(src, 1 * sizeof(TypeParam), 1, 1, dst, - 7 * sizeof(TypeParam), 7, 1, 1, mapxy, 4, + 8 * sizeof(TypeParam), 7, 1, 1, mapxy, 4, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } @@ -510,7 +529,7 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, kleidicv_remap_s16point5_u8( + ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, @@ -540,7 +559,7 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, kleidicv_remap_s16point5_u8( + ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), channels, @@ -594,7 +613,7 @@ class RemapS16Point5 : public testing::Test { } }; -using RemapS16Point5ElementTypes = ::testing::Types; +using RemapS16Point5ElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapS16Point5, RemapS16Point5ElementTypes); TYPED_TEST(RemapS16Point5, RandomNoPadding) { @@ -602,9 +621,11 @@ TYPED_TEST(RemapS16Point5, RandomNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; + size_t channels = 1; + size_t padding = 0; for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_random(src_w, src_h, dst_w, dst_h, 1, border_type, - border_value, 0); + TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); } } @@ -613,9 +634,11 @@ TYPED_TEST(RemapS16Point5, BlendPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; + size_t channels = 1; + size_t padding = 13; for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, 1, border_type, - border_value, 13); + TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, + border_value, padding); } } @@ -624,9 +647,11 @@ TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; + size_t channels = 1; + size_t padding = 13; for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, 1, border_type, - border_value, 13); + TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); } } @@ -648,9 +673,11 @@ TYPED_TEST(RemapS16Point5, CornerCases) { size_t src_h = std::numeric_limits::max() + 1; size_t dst_w = 3 * test::Options::vector_lanes() - 1; size_t dst_h = 4; + size_t channels = 1; + size_t padding = 17; for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, 1, border_type, - border_value, 17); + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); } } @@ -659,9 +686,9 @@ TYPED_TEST(RemapS16Point5, NullPointer) { TypeParam dst[1]; int16_t mapxy[2] = {}; uint16_t mapfrac[1] = {}; - test::test_null_args(kleidicv_remap_s16point5_u8, src, 2, 2, 2, dst, 1, 1, 1, - 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_CONSTANT, - src); + test::test_null_args(remap_s16point5(), src, 2 * sizeof(TypeParam), + 2, 2, dst, 1 * sizeof(TypeParam), 1, 1, 1, mapxy, 4, + mapfrac, 2, KLEIDICV_BORDER_TYPE_CONSTANT, src); } TYPED_TEST(RemapS16Point5, ZeroImageSize) { @@ -671,13 +698,14 @@ TYPED_TEST(RemapS16Point5, ZeroImageSize) { uint16_t mapfrac[1] = {}; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, 0, 1, dst, 1, 0, 1, 1, mapxy, 4, mapfrac, 2, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, 1, 0, dst, 1, 1, 0, 1, mapxy, 4, mapfrac, 2, + remap_s16point5()( + src, 0, 0, 1, dst, 0, 0, 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1 * sizeof(TypeParam), 1, 0, dst, 1 * sizeof(TypeParam), 1, 0, 1, + mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapS16Point5, InvalidImageSize) { @@ -686,38 +714,78 @@ TYPED_TEST(RemapS16Point5, InvalidImageSize) { int16_t mapxy[2] = {}; uint16_t mapfrac[1] = {}; + const size_t kTooBig = std::numeric_limits::max() + 2L; + // Source too wide + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, kTooBig * sizeof(TypeParam), kTooBig, 1, dst, + 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + + // Source too high EXPECT_EQ( KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, std::numeric_limits::max() + 2, 1, dst, 1, 1, 1, 1, - mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_s16point5()( + src, sizeof(TypeParam), 1, kTooBig, dst, 8 * sizeof(TypeParam), 8, 1, + 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source extremely wide EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, 1, std::numeric_limits::max() + 2, dst, 1, 1, 1, 1, + KLEIDICV_ERROR_RANGE, + remap_s16point5()( + src, (KLEIDICV_MAX_IMAGE_PIXELS + 1) * sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source extremely high EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_s16point5_u8( - src, 1, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, dst, 1, 1, 1, 1, - mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_s16point5()( + src, sizeof(TypeParam), 1, KLEIDICV_MAX_IMAGE_PIXELS + 1, dst, + 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Source too large EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_s16point5_u8( - src, 1, KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, - dst, 1, 1, 1, 1, mapxy, 4, mapfrac, 2, + remap_s16point5()( + src, KLEIDICV_MAX_IMAGE_PIXELS * sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, dst, + 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Destination too wide EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_s16point5_u8( - src, 1, 1, 1, dst, 1, KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, - mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + remap_s16point5()( + src, sizeof(TypeParam), 1, 1, dst, + (KLEIDICV_MAX_IMAGE_PIXELS + 1) * sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 1, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + // Destination too large EXPECT_EQ(KLEIDICV_ERROR_RANGE, - kleidicv_remap_s16point5_u8( - src, 1, 1, 1, dst, 1, KLEIDICV_MAX_IMAGE_PIXELS, - KLEIDICV_MAX_IMAGE_PIXELS, 1, mapxy, 4, mapfrac, 2, + remap_s16point5()( + src, 1 * sizeof(TypeParam), 1, 1, dst, + (KLEIDICV_MAX_IMAGE_PIXELS + 1) * sizeof(TypeParam), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, 1, mapxy, + 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); +} + +TYPED_TEST(RemapS16Point5, UnsupportedBigStride) { + const TypeParam src[1] = {}; + TypeParam dst[8]; + int16_t mapxy[16] = {}; + uint16_t mapfrac[8] = {}; + + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, (std::numeric_limits::max() + 1L) * sizeof(TypeParam), + 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + + EXPECT_EQ(KLEIDICV_OK, + remap_s16point5()( + src, std::numeric_limits::max() * sizeof(TypeParam), + 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } @@ -727,10 +795,11 @@ TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) { int16_t mapxy[16] = {}; uint16_t mapfrac[8] = {}; - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, 1, 1, dst, 8, 8, 1, 2, mapxy, 4, mapfrac, 2, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 2, + mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { @@ -740,9 +809,9 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { uint16_t mapfrac[8] = {}; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8(src, 1, 1, 1, dst, 8, 8, 1, 1, mapxy, 4, - mapfrac, 2, - KLEIDICV_BORDER_TYPE_REFLECT, src)); + remap_s16point5()( + src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, + 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src)); } TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) { @@ -751,8 +820,9 @@ TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) { int16_t mapxy[16] = {}; uint16_t mapfrac[8] = {}; - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, - kleidicv_remap_s16point5_u8( - src, 1, 1, 1, dst, 8, 7, 1, 1, mapxy, 4, mapfrac, 2, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 7, 1, 1, + mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 6556f78ad..3861d8028 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -609,24 +609,37 @@ TEST_P(Thread, remap_s16_u16_border_replicate) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16_u8_border_constant) { + const uint8_t border_value = 0; + check_remap_s16(kleidicv_remap_s16_u8, kleidicv_thread_remap_s16_u8, + 1, KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + +TEST_P(Thread, remap_s16_u16_border_constant) { + const uint16_t border_value = 0; + check_remap_s16(kleidicv_remap_s16_u16, + kleidicv_thread_remap_s16_u16, 1, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + TEST_P(Thread, remap_s16_u8_not_implemented) { - const uint8_t border_value[4] = {}; + const uint8_t border_value = 0; check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u8, 2, KLEIDICV_BORDER_TYPE_REPLICATE, - border_value); + &border_value); check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u8, 1, KLEIDICV_BORDER_TYPE_REFLECT, - border_value); + &border_value); } TEST_P(Thread, remap_s16_u16_not_implemented) { - const uint16_t border_value[4] = {}; + const uint16_t border_value = 0; check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u16, 2, KLEIDICV_BORDER_TYPE_REPLICATE, - border_value); + &border_value); check_remap_s16_not_implemented(kleidicv_thread_remap_s16_u16, 1, KLEIDICV_BORDER_TYPE_REFLECT, - border_value); + &border_value); } TEST_P(Thread, remap_s16point5_u8_border_replicate) { @@ -635,30 +648,60 @@ TEST_P(Thread, remap_s16point5_u8_border_replicate) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16point5_u16_border_replicate) { + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 1, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + +TEST_P(Thread, remap_s16point5_u8_border_constant) { + const uint8_t border_value = 0; + check_remap_s16point5(kleidicv_remap_s16point5_u8, + kleidicv_thread_remap_s16point5_u8, 1, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + +TEST_P(Thread, remap_s16point5_u16_border_constant) { + const uint16_t border_value = 0; + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 1, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + TEST_P(Thread, remap_s16point5_u8_not_implemented) { - const uint8_t border_value[4] = {}; + const uint8_t border_value = 0; check_remap_s16point5_not_implemented( kleidicv_thread_remap_s16point5_u8, 2, KLEIDICV_BORDER_TYPE_REPLICATE, - border_value); + &border_value); check_remap_s16point5_not_implemented( kleidicv_thread_remap_s16point5_u8, 1, KLEIDICV_BORDER_TYPE_REFLECT, - border_value); + &border_value); +} + +TEST_P(Thread, remap_s16point5_u16_not_implemented) { + const uint16_t border_value = 0; + check_remap_s16point5_not_implemented( + kleidicv_thread_remap_s16point5_u16, 2, KLEIDICV_BORDER_TYPE_REPLICATE, + &border_value); + check_remap_s16point5_not_implemented( + kleidicv_thread_remap_s16point5_u16, 1, KLEIDICV_BORDER_TYPE_REFLECT, + &border_value); } TEST_P(Thread, warp_perspective_u8_border_replicate) { - const uint8_t border_value[4] = {}; - check_warp_perspective(kleidicv_warp_perspective_u8, - kleidicv_thread_warp_perspective_u8, 1, - KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, border_value); + const uint8_t border_value = 0; + check_warp_perspective( + kleidicv_warp_perspective_u8, kleidicv_thread_warp_perspective_u8, 1, + KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, + &border_value); } TEST_P(Thread, warp_perspective_u8_linear_border_replicate) { - const uint8_t border_value[4] = {}; - check_warp_perspective(kleidicv_warp_perspective_u8, - kleidicv_thread_warp_perspective_u8, 1, - KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, border_value); + const uint8_t border_value = 0; + check_warp_perspective( + kleidicv_warp_perspective_u8, kleidicv_thread_warp_perspective_u8, 1, + KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, + &border_value); } TEST_P(Thread, warp_perspective_u8_not_implemented) { -- GitLab