From 8aaafef0f9f48381f9df701baa9e5359189c3f58 Mon Sep 17 00:00:00 2001 From: Richard Wells Date: Tue, 18 Feb 2025 15:06:45 +0000 Subject: [PATCH] Implement 4-channel Remap16Point5 replicate border. --- CHANGELOG.md | 13 +- adapters/opencv/kleidicv_hal.cpp | 16 +- conformity/opencv/test_remap.cpp | 31 +- doc/functionality.md | 14 +- doc/opencv.md | 4 +- kleidicv/include/kleidicv/kleidicv.h | 4 +- kleidicv/include/kleidicv/transform/remap.h | 6 +- .../src/transform/remap_s16point5_neon.cpp | 576 +++++++++++++----- .../src/transform/remap_s16point5_sve2.cpp | 386 +++++++++++- scripts/benchmark/benchmarks.txt | 2 + test/api/test_remap.cpp | 251 +++++--- test/api/test_thread.cpp | 12 + 12 files changed, 1031 insertions(+), 284 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffaf5232d..828d90092 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,13 +17,14 @@ This changelog aims to follow the guiding principles of ### Added - Implementation of Rotate 90 degrees clockwise. -- Remap implementations with - - Integer coordinates with nearest neighbour method +- Remap implementations for u8 and u16 images + - Integer coordinates with nearest neighbour method (1 channel only) + - Replicated and constant borders - Fixed-point coordinates with linear interpolation - - Floating-point coordinates with nearest neighbour and linear interpolation - - Replicated and constant borders - - 1 channel source image for integer and fixed-points coordinates, 1 and 2 channels for floating-point coordinates - - u8 and u16 images + - 1 channel with Replicated and constant borders + - 4 channels with Replicated borders only + - Floating-point coordinates with nearest neighbour and linear interpolation (1 and 2 channels) + - Replicated and constant borders - WarpPerspective implementation - Nearest and Linear interpolation method, for 1-channel u8 input. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 8a897d1e4..8525450a4 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1354,24 +1354,24 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, auto mt = get_multithreading(); - if (src_type == CV_8UC1) { + if (CV_MAT_DEPTH(src_type) == CV_8U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_s16point5_u8( reinterpret_cast(src_data), src_step, static_cast(src_width), static_cast(src_height), reinterpret_cast(dst_data), dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, - border_value.data(), mt)); - } else if (src_type == CV_16UC1) { + static_cast(dst_width), static_cast(dst_height), + CV_MAT_CN(src_type), mapxy, mapxy_step, mapfrac, mapfrac_step, + kleidicv_border_type, border_value.data(), mt)); + } else if (CV_MAT_DEPTH(src_type) == CV_16U) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_s16point5_u16( reinterpret_cast(src_data), src_step, static_cast(src_width), static_cast(src_height), reinterpret_cast(dst_data), dst_step, - static_cast(dst_width), static_cast(dst_height), 1, - mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, - border_value.data(), mt)); + static_cast(dst_width), static_cast(dst_height), + CV_MAT_CN(src_type), mapxy, mapxy_step, mapfrac, mapfrac_step, + kleidicv_border_type, border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index f80e1ab3d..566caf88f 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -12,8 +12,8 @@ const int kMaxHeight = 36, kMaxWidth = 32; -template -static cv::Mat get_source_mat(int format) { +template +cv::Mat get_source_mat(int format) { auto generate_source = [&]() { cv::Mat m(kMaxHeight, kMaxWidth, format); const int64_t kMaxValue = std::numeric_limits::max(); @@ -21,9 +21,13 @@ static cv::Mat get_source_mat(int format) { for (size_t column = 0; column < kMaxWidth; ++column) { // Create as many different differences between neighbouring pixels as // possible - size_t counter = row + column; - m.at(row, column) = - (counter % 2) ? kMaxValue : (counter % (kMaxValue + 1)); + cv::Vec pixel_value; + for (size_t ch = 0; ch < Channels; ++ch) { + size_t counter = row + column + ch; + pixel_value[ch] = + (counter % 2) ? kMaxValue : (counter % (kMaxValue + 1)); + } + m.at>(row, column) = pixel_value; } } return m; @@ -36,7 +40,7 @@ static cv::Mat get_source_mat(int format) { template cv::Mat exec_remap_s16(cv::Mat& mapxy_mat) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format); cv::Mat empty; remap(source_mat, result, mapxy_mat, empty, Interpolation, BorderMode, @@ -49,12 +53,13 @@ template bool test_remap_s16(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); cv::RNG rng(0); for (size_t w = 5; w <= kMaxWidth; w += 3) { for (size_t h = 5; h <= kMaxHeight; h += 2) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = + get_source_mat(Format); cv::Mat mapxy_mat(w, h, CV_16SC2); rng.fill(mapxy_mat, cv::RNG::UNIFORM, -3, kMaxWidth + 3); @@ -89,7 +94,7 @@ cv::Mat exec_remap_s16point5(cv::Mat& map_mat) { ushort* p_frac = map_mat.rowRange(height, map_mat.rows).ptr(); cv::Mat mapfrac_mat{height, map_mat.cols, CV_16UC1, p_frac}; cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format); - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); remap(source_mat, result, mapxy_mat, mapfrac_mat, Interpolation, BorderMode, BorderValue / 1000.0); return result; @@ -100,7 +105,7 @@ template bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); cv::RNG rng(0); for (int w = 5; w <= kMaxWidth; ++w) { @@ -138,7 +143,7 @@ bool test_remap_s16point5(int index, RecreatedMessageQueue& request_queue, template cv::Mat exec_remap_f32(cv::Mat& mapxy_mat) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); cv::Mat result(mapxy_mat.rows, mapxy_mat.cols, Format); cv::Mat mapx_mat = mapxy_mat.rowRange(0, mapxy_mat.rows / 2); @@ -154,7 +159,7 @@ template bool test_remap_f32(int index, RecreatedMessageQueue& request_queue, RecreatedMessageQueue& reply_queue) { - cv::Mat source_mat = get_source_mat(Format); + cv::Mat source_mat = get_source_mat(Format); cv::RNG rng(0); for (size_t w = 5; w <= kMaxWidth * 2; w += 3) { @@ -250,7 +255,9 @@ std::vector& remap_tests_get() { TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), diff --git a/doc/functionality.md b/doc/functionality.md index 3d178946f..62bc84170 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -93,12 +93,14 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | 8x8 | | x | # Remap -| | 1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | -|---------------------------------------------------|---------|---------|--------|---------| -| Remap int16 coordinates | x | x | | | -| Remap int16+uint16 fixed-point coordinates | x | x | | | -| Remap float32 coordinates - nearest interpolation | x | x | x | x | -| Remap float32 coordinates - linear interpolation | x | x | x | x | +| | 1ch u8 | 1ch u16 | 2ch u8 | 2ch u16 | 4ch u8 | 4ch u16 | +|---------------------------------------------------|---------|---------|--------|---------|--------|---------| +| Remap int16 coordinates | x | x | | | | | +| Remap int16+uint16 fixed-point coordinates | x | x | | | R | R | +| Remap float32 coordinates - nearest interpolation | x | x | x | x | | | +| Remap float32 coordinates - linear interpolation | x | x | x | x | | | + +R = Replicated borders only # WarpPerspective | | u8 | diff --git a/doc/opencv.md b/doc/opencv.md index dfc154e09..f70aab86f 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -208,7 +208,9 @@ Geometrically transforms the `src` image by taking the pixels specified by the c Notes on parameters: * `src.step` - must be less than 65536 * element size. * `src.width`, `src_height` - must not be greater than 32768. -* `src.type()` - supports `CV_8UC1` and `CV_16UC1`. With `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well. +* `src.type()` - supports `CV_8UC1` and `CV_16UC1` with all map configs + * additionally, with `CV_32FC1` map config, it supports `CV_8UC2` and `CV_16UC2` as well. + * additionally, with `CV_16SC2` plus `CV_16UC1` map config and `BORDER_REPLICATE`, it supports `CV_8UC4` and `CV_16UC4` * `dst.cols` - must be at least 4 (32FC1-type maps) or 8 (16SC2-type maps) * `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT`. diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 6a5ff2658..2c4e1dff5 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1786,7 +1786,9 @@ KLEIDICV_API_DECLARATION(kleidicv_in_range_f32, const float *src, /// start of the next row for the destination data. /// Must be a multiple of `sizeof(int16_t)` and no less than /// `width * sizeof(int16_t)`, except for single-row images. -/// @param channels Number of channels in the data. Must be 1. +/// @param channels Number of channels in the data: \n +/// - Must be 1 for constant border. +// - Must be 1 or 4 for replicate border. /// @param border_type Way of handling the border. The supported border types /// are: \n /// - @ref KLEIDICV_BORDER_TYPE_CONSTANT diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index ea72f5666..6f807c4b7 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -38,13 +38,15 @@ inline bool remap_s16point5_is_implemented( size_t channels) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value || std::is_same::value) { - return (src_stride / sizeof(T) <= std::numeric_limits::max() && + return (src_stride / sizeof(T) <= + (std::numeric_limits::max() / channels) && dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - channels == 1); + (channels == 1 || + (channels == 4 && border_type == KLEIDICV_BORDER_TYPE_REPLICATE))); } else { return false; } diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 27893a0a3..32562c519 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" #include "kleidicv/transform/remap.h" @@ -138,6 +136,61 @@ class RemapS16Point5Replicate { int16x8_t v_ymax_; }; // end of class RemapS16Point5Replicate +// Common interpolation function used by all RemapS16Point5 operations except +// 1-channel u8 with replicated borders (RemapS16Point5Replicate) +// because that processes one half vector in one step +static uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, + uint16x8_t d, uint16x8_t xfrac, uint16x8_t yfrac, + uint16x8_t nxfrac, uint16x8_t nyfrac) { + auto interpolate_horizontal = [](uint16x4_t left, uint16x4_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return vmlal_u16(vmull_u16(nfrac, left), frac, right); + }; + + auto interpolate_horizontal_low = [interpolate_horizontal]( + uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), + vget_low_u16(frac), vget_low_u16(nfrac)); + }; + + auto interpolate_horizontal_high = [interpolate_horizontal]( + uint16x8_t left, uint16x8_t right, + uint16x8_t frac, + uint16x8_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), + vget_high_u16(frac), vget_high_u16(nfrac)); + }; + + // Offset pixel values by 0.5 before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, + uint32x4_t nfrac) -> uint32x4_t { + uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); + return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint32x4_t line0_low = interpolate_horizontal_low(a, b, xfrac, nxfrac); + uint32x4_t line1_low = interpolate_horizontal_low(c, d, xfrac, nxfrac); + uint32x4_t line0_high = interpolate_horizontal_high(a, b, xfrac, nxfrac); + uint32x4_t line1_high = interpolate_horizontal_high(c, d, xfrac, nxfrac); + + uint32x4_t lo = + interpolate_vertical(line0_low, line1_low, vmovl_u16(vget_low_u16(yfrac)), + vmovl_u16(vget_low_u16(nyfrac))); + uint32x4_t hi = interpolate_vertical( + line0_high, line1_high, vmovl_high_u16(yfrac), vmovl_high_u16(nyfrac)); + + // Discard upper 16 bits of each element (low the precision back to original + // 16 bits) + uint16x8_t result = + vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); + return result; +} + template <> class RemapS16Point5Replicate { public: @@ -207,11 +260,12 @@ class RemapS16Point5Replicate { void transform_pixels(Columns dst) { uint16x8_t a = load_pixels(x0_, y0_); - uint16x8_t b = load_pixels(x0_, y1_); - uint16x8_t c = load_pixels(x1_, y0_); + uint16x8_t b = load_pixels(x1_, y0_); + uint16x8_t c = load_pixels(x0_, y1_); uint16x8_t d = load_pixels(x1_, y1_); - uint16x8_t result = interpolate(a, b, c, d); + uint16x8_t result = + interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); vst1q_u16(&dst[0], result); } @@ -245,56 +299,6 @@ class RemapS16Point5Replicate { return pixels; } - uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, - uint16x8_t d) { - auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, - uint16x4_t frac, - uint16x4_t nfrac) -> uint32x4_t { - return vmlal_u16(vmull_u16(nfrac, left), frac, right); - }; - - auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right, - uint16x8_t frac, - uint16x8_t nfrac) -> uint32x4_t { - return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), - vget_low_u16(frac), vget_low_u16(nfrac)); - }; - - auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, - uint16x8_t frac, - uint16x8_t nfrac) -> uint32x4_t { - return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), - vget_high_u16(frac), vget_high_u16(nfrac)); - }; - - // Offset pixel values by 0.5 before rounding down. - const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, - uint32x4_t nfrac) -> uint32x4_t { - uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); - return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); - }; - - uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_); - uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_); - uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_); - uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_); - - uint32x4_t lo = interpolate_vertical(line0_low, line1_low, - vmovl_u16(vget_low_u16(yfrac_)), - vmovl_u16(vget_low_u16(nyfrac_))); - uint32x4_t hi = - interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_), - vmovl_high_u16(nyfrac_)); - - // Discard upper 16 bits of each element (low the precision back to original - // 16 bits) - uint16x8_t result = - vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); - return result; - } - private: Rows src_rows_; uint16x8_t v_src_element_stride_; @@ -326,20 +330,20 @@ class RemapS16Point5ConstantBorder { v_src_stride_{vdupq_n_u16(static_cast(src_rows_.stride()))}, v_width_{vdupq_n_u16(static_cast(src_width))}, v_height_{vdupq_n_u16(static_cast(src_height))}, - v_border_{vdup_n_u8(*border_value)} {} + v_border_{vdupq_n_u16(static_cast(*border_value))} {} void process_row(size_t width, Columns mapxy, Columns mapfrac, Columns dst) { auto vector_path = [&](size_t step) { int16x8x2_t xy = vld2q_s16(&mapxy[0]); uint16x8_t frac = vld1q_u16(&mapfrac[0]); - uint8x8_t frac_max = vdup_n_u8(REMAP16POINT5_FRAC_MAX); - uint8x8_t frac_mask = vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1); - uint8x8_t xfrac = vand_u8(vmovn_u16(frac), frac_mask); - uint8x8_t yfrac = - vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); - uint8x8_t nxfrac = vsub_u8(frac_max, xfrac); - uint8x8_t nyfrac = vsub_u8(frac_max, yfrac); + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); + uint16x8_t xfrac = vandq_u16(frac, frac_mask); + uint16x8_t yfrac = + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); + uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac); + uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac); uint16x8_t one = vdupq_n_u16(1); uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); @@ -347,19 +351,18 @@ class RemapS16Point5ConstantBorder { uint16x8_t x1 = vaddq_u16(x0, one); uint16x8_t y1 = vaddq_u16(y0, one); - uint8x8_t v00 = load_pixels_or_constant_border( + uint16x8_t a = load_pixels_or_constant_border( src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y0); - uint8x8_t v01 = load_pixels_or_constant_border( - src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1); - uint8x8_t v10 = load_pixels_or_constant_border( + uint16x8_t b = load_pixels_or_constant_border( src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y0); - uint8x8_t v11 = load_pixels_or_constant_border( + uint16x8_t c = load_pixels_or_constant_border( + src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x0, y1); + uint16x8_t d = load_pixels_or_constant_border( src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); - uint8x8_t result = interpolate(v00, v01, v10, v11, xfrac, vmovl_u8(yfrac), - nxfrac, vmovl_u8(nyfrac)); + uint16x8_t result = interpolate(a, b, c, d, xfrac, yfrac, nxfrac, nyfrac); - vst1_u8(&dst[0], result); + vst1_u8(&dst[0], vqmovn_u16(result)); mapxy += ptrdiff_t(step); mapfrac += ptrdiff_t(step); dst += ptrdiff_t(step); @@ -375,12 +378,12 @@ class RemapS16Point5ConstantBorder { } private: - uint8x8_t load_pixels_or_constant_border(Rows &src_rows_, - uint16x8_t v_src_element_stride_, - uint16x8_t v_width_, - uint16x8_t v_height_, - uint8x8_t v_border_, uint16x8_t x, - uint16x8_t y) { + uint16x8_t load_pixels_or_constant_border(Rows &src_rows_, + uint16x8_t v_src_element_stride_, + uint16x8_t v_width_, + uint16x8_t v_height_, + uint16x8_t v_border_, uint16x8_t x, + uint16x8_t y) { // Find whether coordinates are within the image dimensions. // Negative coordinates are interpreted as large values due to the s16->u16 // reinterpretation. @@ -411,45 +414,14 @@ class RemapS16Point5ConstantBorder { src_rows_[vgetq_lane_u32(indices_high, 3)], }; // Select between source pixels and border colour - return vbsl_u8(vmovn_u16(in_range), pixels, v_border_); - } - - uint8x8_t interpolate(uint8x8_t v00, uint8x8_t v01, uint8x8_t v10, - uint8x8_t v11, uint8x8_t xfrac, uint16x8_t yfrac, - uint8x8_t nxfrac, uint16x8_t nyfrac) { - auto interpolate_horizontal = [&](uint8x8_t left, uint8x8_t right) { - return vmlal_u8(vmull_u8(nxfrac, left), xfrac, right); - }; - - // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. - const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto interpolate_vertical = [&](uint16x4_t a, uint16x4_t b, uint16x4_t frac, - uint16x4_t nfrac) { - uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, a, nfrac), b, frac); - return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); - }; - - uint16x8_t line0 = interpolate_horizontal(v00, v10); - uint16x8_t line1 = interpolate_horizontal(v01, v11); - - uint16x4_t lo = - interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), - vget_low_u16(yfrac), vget_low_u16(nyfrac)); - uint16x4_t hi = - interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), - vget_high_u16(yfrac), vget_high_u16(nyfrac)); - - // Discard upper 8 bits of each element and combine low and high parts into - // a single register. - return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); + return vbslq_u16(in_range, vmovl_u8(pixels), v_border_); } Rows src_rows_; uint16x8_t v_src_stride_; uint16x8_t v_width_; uint16x8_t v_height_; - uint8x8_t v_border_; + uint16x8_t v_border_; }; // end of class RemapS16Point5ConstantBorder template <> @@ -516,11 +488,12 @@ class RemapS16Point5ConstantBorder { void transform_pixels(Columns dst) { uint16x8_t a = load_pixels(x0_, y0_); - uint16x8_t b = load_pixels(x0_, y1_); - uint16x8_t c = load_pixels(x1_, y0_); + uint16x8_t b = load_pixels(x1_, y0_); + uint16x8_t c = load_pixels(x0_, y1_); uint16x8_t d = load_pixels(x1_, y1_); - uint16x8_t result = interpolate(a, b, c, d); + uint16x8_t result = + interpolate(a, b, c, d, xfrac_, yfrac_, nxfrac_, nyfrac_); vst1q_u16(&dst[0], result); } @@ -559,56 +532,6 @@ class RemapS16Point5ConstantBorder { return vbslq_u16(in_range, pixels, v_border_); } - uint16x8_t interpolate(uint16x8_t a, uint16x8_t b, uint16x8_t c, - uint16x8_t d) { - auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, - uint16x4_t frac, - uint16x4_t nfrac) -> uint32x4_t { - return vmlal_u16(vmull_u16(nfrac, left), frac, right); - }; - - auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right, - uint16x8_t frac, - uint16x8_t nfrac) -> uint32x4_t { - return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), - vget_low_u16(frac), vget_low_u16(nfrac)); - }; - - auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, - uint16x8_t frac, - uint16x8_t nfrac) -> uint32x4_t { - return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), - vget_high_u16(frac), vget_high_u16(nfrac)); - }; - - // Offset pixel values by 0.5 before rounding down. - const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto interpolate_vertical = [&](uint32x4_t a, uint32x4_t b, uint32x4_t frac, - uint32x4_t nfrac) -> uint32x4_t { - uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, a, nfrac), b, frac); - return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); - }; - - uint32x4_t line0_low = interpolate_horizontal_low(a, c, xfrac_, nxfrac_); - uint32x4_t line1_low = interpolate_horizontal_low(b, d, xfrac_, nxfrac_); - uint32x4_t line0_high = interpolate_horizontal_high(a, c, xfrac_, nxfrac_); - uint32x4_t line1_high = interpolate_horizontal_high(b, d, xfrac_, nxfrac_); - - uint32x4_t lo = interpolate_vertical(line0_low, line1_low, - vmovl_u16(vget_low_u16(yfrac_)), - vmovl_u16(vget_low_u16(nyfrac_))); - uint32x4_t hi = - interpolate_vertical(line0_high, line1_high, vmovl_high_u16(yfrac_), - vmovl_high_u16(nyfrac_)); - - // Discard upper 16 bits of each element (low the precision back to original - // 16 bits) - uint16x8_t result = - vuzp1q_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)); - return result; - } - private: Rows src_rows_; uint16x8_t v_src_element_stride_; @@ -625,6 +548,312 @@ class RemapS16Point5ConstantBorder { int16x8_t y1_; }; // end of class RemapS16Point5ConstantBorder +inline void get_coordinates(Columns mapxy, + Columns mapfrac, uint16x8_t &x, + uint16x8_t &y, uint16x8_t &xfrac, + uint16x8_t &yfrac) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + x = xy.val[0]; + y = xy.val[1]; + + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + xfrac = vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + yfrac = vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), + vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1)); +} + +inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, + uint16x4_t y1, uint32x4_t &offsets_a, + uint32x4_t &offsets_b, uint32x4_t &offsets_c, + uint32x4_t &offsets_d, + uint16x4_t v_src_element_stride) { + // Multiply by 4 because of channels + uint32x4_t x0_scaled = vshll_n_u16(x0, 2); + uint32x4_t x1_scaled = vshll_n_u16(x1, 2); + + // Calculate offsets from coordinates (y * element_stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride); + offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride); + offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride); + offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride); +} + +inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low, + uint8_t frac_high) { + uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low, + frac_high, frac_high, frac_high, frac_high}; + return vmovl_u8(frac_low_high); +} + +inline uint64_t load_32bit(const uint8_t *src) { + uint32_t value = 0; + memcpy(&value, src, sizeof(uint32_t)); + return static_cast(value); +} + +inline uint8x16_t load_4px_4ch(Rows src_rows, + uint32x4_t offsets) { + uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32); + uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32); + return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23)); +} + +inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns dst) { + vst1q_u8_x2(&dst[0], res); +} + +inline uint16x8_t load_2px_4ch(Rows src_rows, + uint32x2_t offsets) { + return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]), + vld1_u16(&src_rows[vget_lane_u32(offsets, 1)])); +} + +inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns dst) { + vst1q_u16_x4(&dst[0], res); +} + +// Replicate border specific functions +inline void get_coordinates_replicate(Columns mapxy, + Columns mapfrac, + uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, + uint16x8_t &xfrac, uint16x8_t &yfrac, + int16x8_t v_xmax, int16x8_t v_ymax) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + // Zero the xfrac (or yfrac) if x (or y) are below zero + xfrac = vbslq_u16(vcltq_s16(x0, vdupq_n_s16(0)), vdupq_n_u16(0), xfrac); + yfrac = vbslq_u16(vcltq_s16(y0, vdupq_n_s16(0)), vdupq_n_u16(0), yfrac); + + // Clamp coordinates to within the dimensions of the source image + x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax))); + y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax))); + + // x1 = x0 + 1, except if it's already xmax + x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax)); + y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax)); +} + +inline void load_pixels_u8_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b, + uint8x16_t &c, uint8x16_t &d) { + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); +} + +inline void load_pixels_u16_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo, + uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo, + uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) { + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); +} + +template +class RemapS16Point5Replicate4ch; + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint16x8_t xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + + // Doubled fractions 001122..., low part + uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); + uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); + uint16x8_t nxfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + uint16x8_t nyfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + uint16x8_t res0 = interpolate( + vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), vmovl_u8(vget_low(c)), + vmovl_u8(vget_low(d)), vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), + vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); + uint16x8_t res1 = interpolate( + vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), + vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), vzip2q(yfrac2, yfrac2), + vzip2q(nxfrac2, nxfrac2), vzip2q(nyfrac2, nyfrac2)); + res.val[0] = + vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + // Doubled fractions 001122..., high part + xfrac2 = vzip2q(xfrac, xfrac); + yfrac2 = vzip2q(yfrac, yfrac); + nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res0 = interpolate(vmovl_u8(vget_low(a)), vmovl_u8(vget_low(b)), + vmovl_u8(vget_low(c)), vmovl_u8(vget_low(d)), + vzip1q(xfrac2, xfrac2), vzip1q(yfrac2, yfrac2), + vzip1q(nxfrac2, nxfrac2), vzip1q(nyfrac2, nyfrac2)); + res1 = interpolate(vmovl_high_u8(a), vmovl_high_u8(b), vmovl_high_u8(c), + vmovl_high_u8(d), vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + res.val[1] = + vuzp1q_u8(vreinterpretq_u8_u16(res0), vreinterpretq_u8_u16(res1)); + + store_pixels_u8_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint16x8_t xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; + uint16x8x4_t res; + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + + // Doubled fractions 001122..., low part + uint16x8_t xfrac2 = vzip1q(xfrac, xfrac); + uint16x8_t yfrac2 = vzip1q(yfrac, yfrac); + uint16x8_t nxfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + uint16x8_t nyfrac2 = + vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res.val[0] = + interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), + vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), + vzip1q(nyfrac2, nyfrac2)); + res.val[1] = + interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + // Doubled fractions 001122..., high part + xfrac2 = vzip2q(xfrac, xfrac); + yfrac2 = vzip2q(yfrac, yfrac); + nxfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2); + nyfrac2 = vsubq_u16(vdupq_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2); + // Quadrupled fractions (00001111) are passed to interpolate + res.val[2] = + interpolate(a_low, b_low, c_low, d_low, vzip1q(xfrac2, xfrac2), + vzip1q(yfrac2, yfrac2), vzip1q(nxfrac2, nxfrac2), + vzip1q(nyfrac2, nyfrac2)); + res.val[3] = + interpolate(a_high, b_high, c_high, d_high, vzip2q(xfrac2, xfrac2), + vzip2q(yfrac2, yfrac2), vzip2q(nxfrac2, nxfrac2), + vzip2q(nyfrac2, nyfrac2)); + + store_pixels_u16_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -656,13 +885,24 @@ kleidicv_error_t remap_s16point5( Rows dst_rows{dst, dst_stride, channels}; Rectangle rect{dst_width, dst_height}; if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, - border_value}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - RemapS16Point5Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5Replicate4ch operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/remap_s16point5_sve2.cpp b/kleidicv/src/transform/remap_s16point5_sve2.cpp index 489d37386..982717173 100644 --- a/kleidicv/src/transform/remap_s16point5_sve2.cpp +++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp @@ -519,6 +519,357 @@ class RemapS16Point5ConstantBorder { svuint16_t& v_border_; }; // end of class RemapS16Point5ConstantBorder +template +class RemapS16Point5Replicate4ch; + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_stride_ = svdup_u16(src_rows.stride()); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + + // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 + // channels + auto load_4ch_b = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_b, reinterpret_cast(&src_rows_[0]), + svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_))); + }; + auto load_4ch_t = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_t, reinterpret_cast(&src_rows_[0]), + svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_))); + }; + + FracVectorType frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t line0 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); + svuint16_t line1 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); + svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); + acc_b = svmlalb_u32(acc_b, line1, yfrac); + acc_t = svmlalt_u32(acc_t, line1, yfrac); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // bottom part + svuint8_t a = load_4ch_b(x0, y0); + svuint8_t b = load_4ch_b(x1, y0); + svuint8_t c = load_4ch_b(x0, y1); + svuint8_t d = load_4ch_b(x1, y1); + // from xfrac, we need the bottom part twice + svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); + svuint16_t nxfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); + svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); + svuint16_t nyfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_bb = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_bt = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_b = + svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); + + // top part + a = load_4ch_t(x0, y0); + b = load_4ch_t(x1, y0); + c = load_4ch_t(x0, y1); + d = load_4ch_t(x1, y1); + // from xfrac, we need the top part twice + svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); + svuint16_t nxfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); + svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); + svuint16_t nyfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_tb = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_tt = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_t = + svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); + + svbool_t pg_low = svwhilelt_b32(0L, step); + svbool_t pg_high = svwhilelt_b32(svcntw(), static_cast(step)); + svuint32_t res_low = + svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + svuint32_t res_high = + svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + mapxy += step; + svst1_u32(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u32(pg_high, reinterpret_cast(&dst[0]) + svcntw(), + res_high); + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint16_t; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint32_t& v_src_stride, + svint32_t& v_x_max, svint32_t& v_y_max) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_stride_ = svdup_u32(src_rows.stride()); + v_xmax_ = svdup_s32(static_cast(src_width - 1)); + v_ymax_ = svdup_s32(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, svcntw()}; + loop.unroll_once([&](size_t step) { + vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), svptrue_b64(), + svptrue_b64(), mapxy, mapfrac, dst, + static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = svwhilelt_b32(step, step + length); + svbool_t pg64_b = svtrn1_b32(pg, svpfalse()); + svbool_t pg64_t = svtrn2_b32(pg, svpfalse()); + svbool_t pg_low = svzip1_b32(pg, svpfalse()); + svbool_t pg_high = svzip2_b32(pg, svpfalse()); + vector_path(pg, pg64_b, pg64_t, pg_low, pg_high, mapxy, mapfrac, dst, + static_cast(length)); + }); + } + + void vector_path(svbool_t pg, svbool_t pg64_b, svbool_t pg64_t, + svbool_t pg_low, svbool_t pg_high, + Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + // Load one vector of xy: even coordinates are x, odd are y + svint16_t xy = svreinterpret_s16_u32( + svld1_u32(pg, reinterpret_cast(&mapxy[0]))); + svint32_t x = svmovlb(xy); + svint32_t y = svmovlt(xy); + // Clamp coordinates to within the dimensions of the source image + svuint32_t x0 = svreinterpret_u32_s32( + svmax_x(pg, svdup_n_s32(0), svmin_x(pg, x, v_xmax_))); + svuint32_t y0 = svreinterpret_u32_s32( + svmax_x(pg, svdup_n_s32(0), svmin_x(pg, y, v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint32_t x1 = svreinterpret_u32_s32(svmax_x( + pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, x, 1), v_xmax_))); + svuint32_t y1 = svreinterpret_u32_s32(svmax_x( + pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, y, 1), v_ymax_))); + + auto load_4ch = [&](svbool_t pg, svuint64_t offsets) { + return svreinterpret_u16_u64(svld1_gather_u64offset_u64( + pg, reinterpret_cast(&src_rows_[0]), offsets)); + }; + + svuint16_t xfrac, yfrac, nxfrac, nyfrac; + { + // Fractions are loaded into even lanes + svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); + + // Fractions are doubled, 00112233... (will be doubled again later) + svuint16_t frac = svtrn1(rawfrac, rawfrac); + + xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + } + + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); + svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); + svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); + svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = + svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); + svuint32_t acc_t = + svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); + acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); + acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) + // Calculation is done in 2 parts, top and bottom + svuint16_t res_b, res_t; + + { // bottom + svuint64_t x0w = svshllb_n_u64(x0, 3); + svuint64_t x1w = svshllb_n_u64(x1, 3); + svuint64_t ys0w = svmullb_u64(y0, v_src_stride_); + svuint64_t ys1w = svmullb_u64(y1, v_src_stride_); + svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w); + svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w); + svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w); + svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w); + + svuint16_t a = load_4ch(pg64_b, offsets_a); + svuint16_t b = load_4ch(pg64_b, offsets_b); + svuint16_t c = load_4ch(pg64_b, offsets_c); + svuint16_t d = load_4ch(pg64_b, offsets_d); + + // Copy even lanes twice -> 000022224444... these are the "bottom" + // fractions + svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); + svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); + svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); + svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( + svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); + + res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // top + svuint64_t x0w = svshllt_n_u64(x0, 3); + svuint64_t x1w = svshllt_n_u64(x1, 3); + svuint64_t ys0w = svmullt_u64(y0, v_src_stride_); + svuint64_t ys1w = svmullt_u64(y1, v_src_stride_); + svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w); + svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w); + svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w); + svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w); + + svuint16_t a = load_4ch(pg64_t, offsets_a); + svuint16_t b = load_4ch(pg64_t, offsets_b); + svuint16_t c = load_4ch(pg64_t, offsets_c); + svuint16_t d = load_4ch(pg64_t, offsets_d); + + // Copy odd lanes twice -> 111133335555... these are the "top" + // fractions + svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); + svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); + svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); + svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( + svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); + + res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + svuint64_t res_low = + svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); + svuint64_t res_high = + svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); + svst1_u64(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u64(pg_high, reinterpret_cast(&dst[0]) + svcntd(), + res_high); + mapxy += step; + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint32_t& v_src_stride_; + svint32_t& v_xmax_; + svint32_t& v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -554,16 +905,37 @@ kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { svuint16_t sv_width, sv_height, sv_border; - RemapS16Point5ConstantBorder operation{ - src_rows, src_width, src_height, border_value, - sv_src_stride, sv_width, sv_height, sv_border}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{ + src_rows, src_width, src_height, border_value, + sv_src_stride, sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); svint16_t sv_xmax, sv_ymax; - RemapS16Point5Replicate operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height, + sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + if constexpr (std::is_same::value) { + RemapS16Point5Replicate4ch operation{ + src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } + if constexpr (std::is_same::value) { + svuint32_t stride; + svint32_t xmax, ymax; + RemapS16Point5Replicate4ch operation{src_rows, src_width, src_height, + stride, xmax, ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } + } } return KLEIDICV_OK; } diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index ca82aeda0..c293ef1cb 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -83,8 +83,10 @@ Remap_S16_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16S Remap_S16_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' Remap_S16_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_NEAREST, BORDER_CONSTANT)' Remap_S16Point5_U8_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' +Remap_S16Point5_U8_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U8_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' Remap_S16Point5_U16_Replicate: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' +Remap_S16Point5_U16_Replicate_4ch: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC4, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' Remap_S16Point5_U16_Constant: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 16UC1, 16SC2, INTER_LINEAR, BORDER_CONSTANT)' Remap_F32_U8_Replicate_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_REPLICATE)' Remap_F32_U8_Constant_Nearest: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 32FC1, INTER_NEAREST, BORDER_CONSTANT)' diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index bea82b90b..e85b611bc 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -36,14 +36,17 @@ template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, kleidicv_border_type_t border_type, const ScalarType *border_value) { + // Width is the number of pixels in a row, but Array2D does not handle that + const ptrdiff_t src_width = + static_cast(src.width() / src.channels()); + const ptrdiff_t src_height = static_cast(src.height()); + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - x = std::clamp( - x, 0, static_cast(src.width() / src.channels()) - 1); - y = std::clamp(y, 0, static_cast(src.height()) - 1); + x = std::clamp(x, 0, src_width - 1); + y = std::clamp(y, 0, src_height - 1); } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); - if (x * src.channels() >= src.width() || - y >= static_cast(src.height()) || x < 0 || y < 0) { + if (x >= src_width || y >= src_height || x < 0 || y < 0) { return border_value; } } @@ -147,12 +150,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -176,12 +178,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -524,8 +525,9 @@ class RemapS16Point5 : public testing::Test { } } - // This part is the same as execute_test() but without initializing source. - // Corner Cases use the biggest possible source. + // This part is the same as execute_test() except source initialization. + // Corner Cases use the biggest possible source, so it is only initializing + // the edges. size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -559,12 +561,27 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); + + if (expected.compare_to(actual, 1)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapxy:\n"; + dump(&mapxy); + std::cout << "mapfrac:\n"; + dump(&mapfrac); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } @@ -589,12 +606,27 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); + + if (expected.compare_to(actual)) { + if (source.width() < 100 && source.height() < 100) { + std::cout << "source:\n"; + dump(&source); + } + std::cout << "mapxy:\n"; + dump(&mapxy); + std::cout << "mapfrac:\n"; + dump(&mapfrac); + std::cout << "expected:\n"; + dump(&expected); + std::cout << "actual:\n"; + dump(&actual); + } EXPECT_EQ_ARRAY2D(actual, expected); } @@ -623,17 +655,17 @@ class RemapS16Point5 : public testing::Test { for (size_t row = 0; row < expected.height(); row++) { for (size_t column = 0; column < expected.width() / src.channels(); ++column) { + // Clang-tidy thinks mapfrac may contain garbage, but it is fully + // initialized at all code paths and the map size always equals dst + // map pixel size + // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) + uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); + uint8_t y_frac = + (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); + // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) + const int16_t *coords = mapxy.at(row, column * 2); + ptrdiff_t x = coords[0], y = coords[1]; for (size_t ch = 0; ch < src.channels(); ++ch) { - // Clang-tidy thinks mapfrac may contain garbage, but it is fully - // initialized at all code paths and the map size always equals dst - // map pixel size - // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) - uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); - uint8_t y_frac = - (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); - // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) - const int16_t *coords = mapxy.at(row, column * 2); - int16_t x = coords[0], y = coords[1]; *expected.at(row, column * src.channels() + ch) = lerp_2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); @@ -646,48 +678,69 @@ class RemapS16Point5 : public testing::Test { using RemapS16Point5ElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapS16Point5, RemapS16Point5ElementTypes); +template +size_t defaultWidth() { + return 3 * test::Options::vector_lanes() - 1; +} + +size_t defaultHeight() { return 4; } + TYPED_TEST(RemapS16Point5, RandomNoPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 0; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, padding); + TestFixture::test_random(w, h, w, h, 1, border_type, border_value, 0); } } +// TODO: Modify tests to also run constant border once implemented +TYPED_TEST(RemapS16Point5, RandomNoPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 0; + TestFixture::test_random(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, BlendPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 13; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, padding); + TestFixture::test_blend(w, h, w, h, 1, border_type, border_value, 13); } } +TYPED_TEST(RemapS16Point5, BlendPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 7; + TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 13; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, - border_type, border_value, padding); + TestFixture::test_outside_random(w, h, w, h, 1, border_type, border_value, + 13); } } +TYPED_TEST(RemapS16Point5, OutsideRandomPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 11; + TestFixture::test_outside_random( + w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, padding); +} + TYPED_TEST(RemapS16Point5, BlendBigStride) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 16; + size_t src_w = defaultWidth(); + size_t src_h = defaultHeight(); size_t dst_w = src_w; size_t dst_h = src_h; size_t channels = 1; @@ -698,6 +751,16 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) { } } +TYPED_TEST(RemapS16Point5, BlendBigStride4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = + std::numeric_limits::max() / channels - w * channels; + TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, CornerCases) { size_t src_w = std::numeric_limits::max() + 1; size_t src_h = std::numeric_limits::max() + 1; @@ -711,6 +774,18 @@ TYPED_TEST(RemapS16Point5, CornerCases) { } } +TYPED_TEST(RemapS16Point5, CornerCases4ch) { + size_t src_w = 100; + size_t src_h = 8; + size_t dst_w = defaultWidth(); + size_t dst_h = defaultHeight(); + size_t channels = 4; + size_t padding = 17; + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, + padding); +} + TYPED_TEST(RemapS16Point5, NullPointer) { const TypeParam src[4] = {}; TypeParam dst[1]; @@ -819,17 +894,20 @@ TYPED_TEST(RemapS16Point5, UnsupportedBigStride) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) { +TYPED_TEST(RemapS16Point5, UnsupportedChannels) { const TypeParam src[1] = {}; TypeParam dst[8]; int16_t mapxy[16] = {}; uint16_t mapfrac[8] = {}; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_s16point5()( - src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 2, - mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1, 1, 1, dst, 8, 8, 1, 2, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1, 1, 1, dst, 8, 8, 1, 3, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { @@ -844,6 +922,33 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src)); } +TYPED_TEST(RemapS16Point5, UnsupportedConstantBorder4ch) { + const TypeParam src[1] = {}; + TypeParam dst[8]; + int16_t mapxy[16] = {}; + uint16_t mapfrac[8] = {}; + + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()(src, sizeof(TypeParam), 1, 1, dst, 8, + 8, 1, 4, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_CONSTANT, src)); +} + +TYPED_TEST(RemapS16Point5, UnsupportedBigStride4ch) { + const TypeParam src[1] = {}; + TypeParam dst[8]; + int16_t mapxy[16] = {}; + uint16_t mapfrac[8] = {}; + + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, + (std::numeric_limits::max() / 4 + 1L) * sizeof(TypeParam), + 1, 1, dst, 8, 8, 1, 4, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_CONSTANT, src)); +} + TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) { const TypeParam src[1] = {}; TypeParam dst[8]; diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 9a2b9affa..c16f6356b 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -722,12 +722,24 @@ TEST_P(Thread, remap_s16point5_u8_border_replicate) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16point5_u8_border_replicate_4ch) { + check_remap_s16point5(kleidicv_remap_s16point5_u8, + kleidicv_thread_remap_s16point5_u8, 4, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + TEST_P(Thread, remap_s16point5_u16_border_replicate) { check_remap_s16point5(kleidicv_remap_s16point5_u16, kleidicv_thread_remap_s16point5_u16, 1, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16point5_u16_border_replicate_4ch) { + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 4, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + TEST_P(Thread, remap_s16point5_u8_border_constant) { const uint8_t border_value = 0; check_remap_s16point5(kleidicv_remap_s16point5_u8, -- GitLab