From 5e48d83ff2702058b6d2890693a22b207fd8638a Mon Sep 17 00:00:00 2001 From: Richard Wells Date: Tue, 18 Feb 2025 15:06:45 +0000 Subject: [PATCH 1/2] Implement 4-channel Remap16Point5 replicate border. --- kleidicv/include/kleidicv/transform/remap.h | 5 +- .../src/transform/remap_s16point5_neon.cpp | 349 +++++++++++++- kleidicv/src/transform/remap_sc.h | 454 +++++++++++++++++- test/api/test_remap.cpp | 206 +++++--- 4 files changed, 929 insertions(+), 85 deletions(-) diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index dcf43bd64..285fc22f7 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -38,13 +38,14 @@ inline bool remap_s16point5_is_implemented( size_t channels) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value || std::is_same::value) { - return (src_stride / sizeof(T) <= std::numeric_limits::max() && + return (src_stride / sizeof(T) <= + (std::numeric_limits::max() / channels) && dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - channels == 1); + (channels == 1 || channels == 4)); } else { return false; } diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 27893a0a3..8ffe4601a 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -625,6 +625,332 @@ class RemapS16Point5ConstantBorder { int16x8_t y1_; }; // end of class RemapS16Point5ConstantBorder +template +class RemapS16Point5ReplicateFourChannels; + +template <> +class RemapS16Point5ReplicateFourChannels { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = neon::VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5ReplicateFourChannels(Rows src_rows, + size_t src_width, size_t src_height) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void get_map_coordinates(Columns mapxy, + Columns mapfrac, uint16x8_t &x0, + uint16x8_t &y0, uint16x8_t &x1, uint16x8_t &y1, + uint16x8_t &xfrac, uint16x8_t &yfrac) { + MapVector2Type xy = vld2q_s16(&mapxy[0]); + FracVectorType frac = vld1q_u16(&mapfrac[0]); + xfrac = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); + yfrac = vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), + vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); + + // Clamp coordinates to within the dimensions of the source image + x0 = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); + y0 = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); + + // x1 = x0 + 1, except if it's already xmax + x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); + y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_)); + } + + void get_offsets(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, uint16x4_t y1, + uint32x4_t &offsets_a, uint32x4_t &offsets_b, + uint32x4_t &offsets_c, uint32x4_t 
&offsets_d) { + // Multiply by 4 because of channels + uint32x4_t x0_scaled = vshll_n_u16(x0, 2); + uint32x4_t x1_scaled = vshll_n_u16(x1, 2); + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + offsets_a = vmlal_u16(x0_scaled, y0, v_src_stride_); + offsets_b = vmlal_u16(x1_scaled, y0, v_src_stride_); + offsets_c = vmlal_u16(x0_scaled, y1, v_src_stride_); + offsets_d = vmlal_u16(x1_scaled, y1, v_src_stride_); + }; + + uint16x4_t interpolate(uint16x4_t a, uint16x4_t b, uint16x4_t c, uint16x4_t d, + uint16_t xfrac, uint16_t yfrac) { + uint16x4_t line0 = + vmla_n_u16(vmul_n_u16(b, xfrac), a, REMAP16POINT5_FRAC_MAX - xfrac); + uint32x4_t line0_lerpd = + vmlal_n_u16(vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, + REMAP16POINT5_FRAC_MAX - yfrac); + uint16x4_t line1 = + vmla_n_u16(vmul_n_u16(d, xfrac), c, REMAP16POINT5_FRAC_MAX - xfrac); + return vshrn_n_u32(vmlal_n_u16(line0_lerpd, line1, yfrac), + 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint64_t load_32bit(const ScalarType *src) { + uint32_t value; + memcpy(&value, src, sizeof(uint32_t)); + return static_cast(value); + }; + + uint16x8_t load_64bit(uint32_t offset_low, uint32_t offset_high) { + uint64_t acc = load_32bit(&src_rows_[offset_low]) | + (load_32bit(&src_rows_[offset_high]) << 32); + return vmovl_u8(vset_lane_u64(acc, vdup_n_u64(0), 0)); + }; + + uint16x8_t load_01(uint32x4_t offsets) { + return load_64bit(vgetq_lane_u32(offsets, 0), vgetq_lane_u32(offsets, 1)); + }; + + uint16x8_t load_23(uint32x4_t offsets) { + return load_64bit(vgetq_lane_u32(offsets, 2), vgetq_lane_u32(offsets, 3)); + }; + + uint8x16_t load_and_interpolate(uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, + uint16x4_t xfrac, uint16x4_t yfrac) { + uint16x8_t a = load_01(offsets_a); + uint16x8_t b = load_01(offsets_b); + uint16x8_t c = load_01(offsets_c); + uint16x8_t d = load_01(offsets_d); + + uint16x4x4_t res; + res.val[0] = interpolate(vget_low_u16(a), vget_low_u16(b), vget_low_u16(c), + vget_low_u16(d), vget_lane_u16(xfrac, 0), + vget_lane_u16(yfrac, 0)); + res.val[1] = interpolate(vget_high_u16(a), vget_high_u16(b), + vget_high_u16(c), vget_high_u16(d), + vget_lane_u16(xfrac, 1), vget_lane_u16(yfrac, 1)); + + a = load_23(offsets_a); + b = load_23(offsets_b); + c = load_23(offsets_c); + d = load_23(offsets_d); + + res.val[2] = interpolate(vget_low_u16(a), vget_low_u16(b), vget_low_u16(c), + vget_low_u16(d), vget_lane_u16(xfrac, 2), + vget_lane_u16(yfrac, 2)); + res.val[3] = interpolate(vget_high_u16(a), vget_high_u16(b), + vget_high_u16(c), vget_high_u16(d), + vget_lane_u16(xfrac, 3), vget_lane_u16(yfrac, 3)); + + return vuzp1q_u8(vcombine(res.val[0], res.val[1]), + vcombine(res.val[2], res.val[3])); + } + + void store_pixels(uint8x16_t res_low, uint8x16_t res_high, + Columns dst) { + uint8x16x2_t res; + res.val[0] = res_low; + res.val[1] = res_high; + vst1q_u8_x2(&dst[0], res); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1, xfrac, yfrac; + get_map_coordinates(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + + get_offsets(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, offsets_d); + uint8x16_t res_low = + load_and_interpolate(offsets_a, offsets_b, offsets_c, offsets_d, + vget_low_u16(xfrac), vget_low_u16(yfrac)); + + 
get_offsets(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d); + uint8x16_t res_high = + load_and_interpolate(offsets_a, offsets_b, offsets_c, offsets_d, + vget_high_u16(xfrac), vget_high_u16(yfrac)); + + store_pixels(res_low, res_high, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5ReplicateFourChannels + +// TODO: Refactor this to match the uint8_t layout +template <> +class RemapS16Point5ReplicateFourChannels { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = neon::VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5ReplicateFourChannels(Rows src_rows, + size_t src_width, size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + MapVector2Type xy = vld2q_s16(&mapxy[0]); + FracVectorType frac = vld1q_u16(&mapfrac[0]); + uint16x8_t xfrac = + vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); + uint16x8_t yfrac = + vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), + vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); + + // Clamp coordinates to within the dimensions of the source image + uint16x8_t x0 = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); + uint16x8_t y0 = vreinterpretq_u16_s16( + vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); + + // x1 = x0 + 1, except if it's already xmax + uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); + uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_)); + + auto load_16x4 = [&](uint32_t offset) { + return vld1_u64(reinterpret_cast(&src_rows_[offset])); + }; + + uint16_t xfrac_array[8], yfrac_array[8]; + vst1q_u16(xfrac_array, xfrac); + vst1q_u16(yfrac_array, yfrac); + + auto interpolate = [xfrac_array, yfrac_array](uint16x4_t a, uint16x4_t b, + uint16x4_t c, uint16x4_t d, + size_t index) { + uint32x4_t line0 = + vmlal_n_u16(vmull_n_u16(b, xfrac_array[index]), a, + REMAP16POINT5_FRAC_MAX - xfrac_array[index]); + uint32x4_t line0_lerpd = vmlaq_n_u32( + vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, + static_cast(REMAP16POINT5_FRAC_MAX - yfrac_array[index])); + uint32x4_t line1 = + vmlal_n_u16(vmull_n_u16(d, xfrac_array[index]), c, + REMAP16POINT5_FRAC_MAX - xfrac_array[index]); + return vshrn_n_u32( + vmlaq_n_u32(line0_lerpd, line1, + static_cast(yfrac_array[index])), + 2 * REMAP16POINT5_FRAC_BITS); + }; + +#define LOAD_AND_INTERPOLATE(a, b, c, d, index1, index2) \ + 
interpolate(load_16x4(vgetq_lane_u32((a), (index1))), \ + load_16x4(vgetq_lane_u32((b), (index1))), \ + load_16x4(vgetq_lane_u32((c), (index1))), \ + load_16x4(vgetq_lane_u32((d), (index1))), (index2)) + + // Calculate offsets from coordinates (y * element_stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + uint16x8x4_t res; + { + // multiply by 4 because of channels + uint32x4_t x0_low = vshll_n_u16(vget_low_u16(x0), 2); + uint32x4_t x1_low = vshll_n_u16(vget_low_u16(x1), 2); + uint32x4_t offsets_a = + vmlal_u16(x0_low, vget_low_u16(y0), v_src_element_stride_); + uint32x4_t offsets_b = + vmlal_u16(x1_low, vget_low_u16(y0), v_src_element_stride_); + uint32x4_t offsets_c = + vmlal_u16(x0_low, vget_low_u16(y1), v_src_element_stride_); + uint32x4_t offsets_d = + vmlal_u16(x1_low, vget_low_u16(y1), v_src_element_stride_); + + uint16x4_t res0 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 0, 0); + uint16x4_t res1 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 1, 1); + uint16x4_t res2 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 2, 2); + uint16x4_t res3 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 3, 3); + + res.val[0] = vcombine(res0, res1); + res.val[1] = vcombine(res2, res3); + } + + { + // multiply by 4 because of channels + uint32x4_t x0_high = vshll_high_n_u16(x0, 2); + uint32x4_t x1_high = vshll_high_n_u16(x1, 2); + uint32x4_t offsets_a = + vmlal_u16(x0_high, vget_high_u16(y0), v_src_element_stride_); + uint32x4_t offsets_b = + vmlal_u16(x1_high, vget_high_u16(y0), v_src_element_stride_); + uint32x4_t offsets_c = + vmlal_u16(x0_high, vget_high_u16(y1), v_src_element_stride_); + uint32x4_t offsets_d = + vmlal_u16(x1_high, vget_high_u16(y1), v_src_element_stride_); + + uint16x4_t res0 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 0, 4); + uint16x4_t res1 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 1, 5); + uint16x4_t res2 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 2, 6); + uint16x4_t res3 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, + offsets_d, 3, 7); + + res.val[2] = vcombine(res0, res1); + res.val[3] = vcombine(res2, res3); + } + + vst1q_u16_x4(&dst[0], res); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5ReplicateFourChannels + // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -656,13 +982,26 @@ kleidicv_error_t remap_s16point5( Rows dst_rows{dst, dst_stride, channels}; Rectangle rect{dst_width, dst_height}; if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, - border_value}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + // TODO + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - RemapS16Point5Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5ReplicateFourChannels operation{src_rows, src_width, + src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index a0e27f169..8bf246474 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -580,6 +580,433 @@ class RemapS16Point5Replicate { MapVectorType& v_ymax_; }; // end of class RemapS16Point5Replicate +template +class RemapS16Point5ReplicateFourChannels; + +template <> +class RemapS16Point5ReplicateFourChannels { + public: + using ScalarType = uint8_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5ReplicateFourChannels(Rows src_rows, + size_t src_width, size_t src_height, + svuint16_t& v_src_stride, + MapVectorType& v_x_max, + MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_stride_ = svdup_u16(src_rows.stride()); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + 
svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + + //// NEW PART + // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 + // channels + auto load_4ch_b = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_b, reinterpret_cast(&src_rows_[0]), + svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_))); + }; + auto load_4ch_t = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_t, reinterpret_cast(&src_rows_[0]), + svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_))); + }; + + FracVectorType frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t line0 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); + svuint16_t line1 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); + svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); + acc_b = svmlalb_u32(acc_b, line1, yfrac); + acc_t = svmlalt_u32(acc_t, line1, yfrac); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // bottom part + svuint8_t a = load_4ch_b(x0, y0); + svuint8_t b = load_4ch_b(x1, y0); + svuint8_t c = load_4ch_b(x0, y1); + svuint8_t d = load_4ch_b(x1, y1); + // from xfrac, we need the bottom part twice + svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); + svuint16_t nxfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); + svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); + svuint16_t nyfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_bb = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_bt = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_b = + svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); + + // top part + a = load_4ch_t(x0, y0); + b = load_4ch_t(x1, y0); + c = load_4ch_t(x0, y1); + d = load_4ch_t(x1, y1); + // from xfrac, we need the top part twice + svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); + svuint16_t nxfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); + svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); + svuint16_t nyfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_tb = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... 
+ svuint16_t res_tt = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_t = + svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); + + svbool_t pg_low = svwhilelt_b32(0L, step); + svbool_t pg_high = svwhilelt_b32(svcntw(), static_cast(step)); + svuint32_t res_low = + svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + svuint32_t res_high = + svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + mapxy += step; + svst1_u32(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u32(pg_high, reinterpret_cast(&dst[0]) + svcntw(), + res_high); + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5ReplicateFourChannels + +template <> +class RemapS16Point5ReplicateFourChannels { + public: + using ScalarType = uint16_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5ReplicateFourChannels(Rows src_rows, + size_t src_width, size_t src_height, + svuint16_t& v_src_stride, + MapVectorType& v_x_max, + MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + + auto load_4ch_b = [&](svbool_t pg, svuint32_t offsets) { + return svreinterpret_u16_u64(svld1_gather_u64offset_u64( + pg, reinterpret_cast(&src_rows_[0]), + svshllb_n_u64(offsets, 1))); + }; + auto load_4ch_t = [&](svbool_t pg, svuint32_t offsets) { + return svreinterpret_u16_u64(svld1_gather_u64offset_u64( + pg, reinterpret_cast(&src_rows_[0]), + svshllt_n_u64(offsets, 1))); + }; + + FracVectorType frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, 
svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t nxfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + svuint16_t nyfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); + svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); + svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); + svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = + svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); + svuint32_t acc_t = + svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); + acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); + acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // There are 4 channels: data is 4 times wider than the maps (4x16 bits vs + // 16-bit coordinates) So calculation is done in 4 parts: + // - 0,4,8,... (bottom-bottom) + // - 2,6,10,... (bottom-top) + // - 1,5,9,... (top-bottom) + // - 3,7,11,... (top-top) + svuint16_t res_bb, res_bt, res_tb, res_tt; + + svuint16_t fractbl_bb = + svreinterpret_u16_u64(svindex_u64(0, 0x0004000400040004UL)); + svuint16_t fractbl_bt = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0002000200020002UL)); + svuint16_t fractbl_tb = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0001000100010001UL)); + svuint16_t fractbl_tt = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0003000300030003UL)); + + { // bottom + svbool_t pg_bb = svwhilelt_b64(int64_t{0}, (step + 3) / 4); + svbool_t pg_bt = svwhilelt_b64(int64_t{0}, (step + 2) / 4); + + svuint32_t offsets_a_b = + svmlalb_u32(svshllb_n_u32(x0, 2), y0, v_src_element_stride_); + svuint32_t offsets_b_b = + svmlalb_u32(svshllb_n_u32(x1, 2), y0, v_src_element_stride_); + svuint32_t offsets_c_b = + svmlalb_u32(svshllb_n_u32(x0, 2), y1, v_src_element_stride_); + svuint32_t offsets_d_b = + svmlalb_u32(svshllb_n_u32(x1, 2), y1, v_src_element_stride_); + + { // bottom-bottom + svuint16_t a = load_4ch_b(pg_bb, offsets_a_b); + svuint16_t b = load_4ch_b(pg_bb, offsets_b_b); + svuint16_t c = load_4ch_b(pg_bb, offsets_c_b); + svuint16_t d = load_4ch_b(pg_bb, offsets_d_b); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_bb); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_bb); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_bb); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_bb); + + res_bb = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // bottom-top + svuint16_t a = load_4ch_t(pg_bt, offsets_a_b); + svuint16_t b = load_4ch_t(pg_bt, offsets_b_b); + svuint16_t c = load_4ch_t(pg_bt, offsets_c_b); + svuint16_t d = load_4ch_t(pg_bt, offsets_d_b); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_bt); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_bt); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_bt); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_bt); + + res_bt = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + } + + { // top + svbool_t pg_tb = svwhilelt_b64(int64_t{0}, (step + 1) / 4); + 
svbool_t pg_tt = svwhilelt_b64(int64_t{0}, step / 4); + + svuint32_t offsets_a_t = + svmlalt_u32(svshllt_n_u32(x0, 2), y0, v_src_element_stride_); + svuint32_t offsets_b_t = + svmlalt_u32(svshllt_n_u32(x1, 2), y0, v_src_element_stride_); + svuint32_t offsets_c_t = + svmlalt_u32(svshllt_n_u32(x0, 2), y1, v_src_element_stride_); + svuint32_t offsets_d_t = + svmlalt_u32(svshllt_n_u32(x1, 2), y1, v_src_element_stride_); + + { // top-bottom + svuint16_t a = load_4ch_b(pg_tb, offsets_a_t); + svuint16_t b = load_4ch_b(pg_tb, offsets_b_t); + svuint16_t c = load_4ch_b(pg_tb, offsets_c_t); + svuint16_t d = load_4ch_b(pg_tb, offsets_d_t); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_tb); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_tb); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_tb); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_tb); + + res_tb = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // top-top + svuint16_t a = load_4ch_t(pg_tt, offsets_a_t); + svuint16_t b = load_4ch_t(pg_tt, offsets_b_t); + svuint16_t c = load_4ch_t(pg_tt, offsets_c_t); + svuint16_t d = load_4ch_t(pg_tt, offsets_d_t); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_tt); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_tt); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_tt); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_tt); + + res_tt = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + } + + svbool_t pg_00 = svwhilelt_b64(0L, step); + svbool_t pg_01 = svwhilelt_b64(svcntd(), static_cast(step)); + svbool_t pg_02 = svwhilelt_b64(2 * svcntd(), static_cast(step)); + svbool_t pg_03 = svwhilelt_b64(3 * svcntd(), static_cast(step)); + + // Back-transforming is needed, svst4 cannot be used because the + // interleaving would need equal number of elements in the 4 vectors the + // results are now these: + // - 0,4,8,... (bottom-bottom) + // - 2,6,10,... (bottom-top) + // - 1,5,9,... (top-bottom) + // - 3,7,11,... (top-top) + // first pass will result in: + // - 0,2,4,... (lower and higher ones) + // - 1,3,5,... (lower and higher ones) + // second pass gives back 0,1,2,3,.... 
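+    // For example, with 128-bit vectors (two 4-channel pixels per 64-bit-wide
+    // result vector, where each p below is one whole pixel):
+    //   res_bb = {p0,p4}, res_bt = {p2,p6}, res_tb = {p1,p5}, res_tt = {p3,p7}
+    //   first pass:  {p0,p2},{p4,p6} (even) and {p1,p3},{p5,p7} (odd)
+    //   second pass: {p0,p1},{p2,p3},{p4,p5},{p6,p7}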
+ + svuint64_t res_even_low = svzip1_u64(svreinterpret_u64_u16(res_bb), + svreinterpret_u64_u16(res_bt)); + svuint64_t res_even_high = svzip2_u64(svreinterpret_u64_u16(res_bb), + svreinterpret_u64_u16(res_bt)); + svuint64_t res_odd_low = svzip1_u64(svreinterpret_u64_u16(res_tb), + svreinterpret_u64_u16(res_tt)); + svuint64_t res_odd_high = svzip2_u64(svreinterpret_u64_u16(res_tb), + svreinterpret_u64_u16(res_tt)); + + svuint64_t res_00 = svzip1_u64(res_even_low, res_odd_low); + svuint64_t res_01 = svzip2_u64(res_even_low, res_odd_low); + svuint64_t res_02 = svzip1_u64(res_even_high, res_odd_high); + svuint64_t res_03 = svzip2_u64(res_even_high, res_odd_high); + + svst1_u64(pg_00, reinterpret_cast(&dst[0]), res_00); + svst1_u64(pg_01, reinterpret_cast(&dst[0]) + svcntd(), res_01); + svst1_u64(pg_02, reinterpret_cast(&dst[0]) + 2 * svcntd(), + res_02); + svst1_u64(pg_03, reinterpret_cast(&dst[0]) + 3 * svcntd(), + res_03); + + mapxy += step; + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_element_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5ReplicateFourChannels + template class RemapS16Point5ConstantBorder; @@ -808,16 +1235,29 @@ kleidicv_error_t remap_s16point5_sc( if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { svuint16_t sv_width, sv_height, sv_border; - RemapS16Point5ConstantBorder operation{ - src_rows, src_width, src_height, border_value, - sv_src_stride, sv_width, sv_height, sv_border}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{ + src_rows, src_width, src_height, border_value, + sv_src_stride, sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + // TODO + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); svint16_t sv_xmax, sv_ymax; - RemapS16Point5Replicate operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height, + sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5ReplicateFourChannels operation{ + src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } return KLEIDICV_OK; } diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 7a18a0bcf..1e8d89b31 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -36,13 +36,17 @@ template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, kleidicv_border_type_t border_type, const ScalarType *border_value) { + // Width is the number of pixels in a row, but Array2D does not handle that + const ptrdiff_t src_width = + static_cast(src.width() / src.channels()); + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - x = std::clamp(x, 0, static_cast(src.width()) - 1); + x = std::clamp(x, 0, src_width - 1); y = std::clamp(y, 0, static_cast(src.height()) - 1); } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); - if (x >= static_cast(src.width()) || - y >= static_cast(src.height()) || x < 0 || y < 0) { + if (x >= src_width || y >= static_cast(src.height()) || x < 0 || + y < 0) { 
return border_value; } } @@ -146,12 +150,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -175,12 +178,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -524,8 +526,9 @@ class RemapS16Point5 : public testing::Test { } } - // This part is the same as execute_test() but without initializing source. - // Corner Cases use the biggest possible source. + // This part is the same as execute_test() except source initialization. + // Corner Cases use the biggest possible source, so it is only initializing + // the edges. size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -559,12 +562,12 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } @@ -589,12 +592,12 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -623,17 +626,17 @@ class RemapS16Point5 : public testing::Test { for (size_t row = 0; row < expected.height(); row++) { for (size_t column = 0; column < expected.width() / src.channels(); ++column) { + // Clang-tidy thinks mapfrac may contain garbage, but it is fully + 
// initialized at all code paths and the map size always equals dst + // map pixel size + // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) + uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); + uint8_t y_frac = + (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); + // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) + const int16_t *coords = mapxy.at(row, column * 2); + ptrdiff_t x = coords[0], y = coords[1]; for (size_t ch = 0; ch < src.channels(); ++ch) { - // Clang-tidy thinks mapfrac may contain garbage, but it is fully - // initialized at all code paths and the map size always equals dst - // map pixel size - // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) - uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); - uint8_t y_frac = - (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); - // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) - const int16_t *coords = mapxy.at(row, column * 2); - int16_t x = coords[0], y = coords[1]; *expected.at(row, column * src.channels() + ch) = lerp2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); @@ -646,48 +649,69 @@ class RemapS16Point5 : public testing::Test { using RemapS16Point5ElementTypes = ::testing::Types; TYPED_TEST_SUITE(RemapS16Point5, RemapS16Point5ElementTypes); +template +size_t defaultWidth() { + return 3 * test::Options::vector_lanes() - 1; +} + +size_t defaultHeight() { return 4; } + TYPED_TEST(RemapS16Point5, RandomNoPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 0; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, padding); + TestFixture::test_random(w, h, w, h, 1, border_type, border_value, 0); } } +// TODO: Modify tests to also run constant border once implemented +TYPED_TEST(RemapS16Point5, RandomNoPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 0; + TestFixture::test_random(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, BlendPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 13; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type, - border_value, padding); + TestFixture::test_blend(w, h, w, h, 1, border_type, border_value, 13); } } +TYPED_TEST(RemapS16Point5, BlendPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 7; + TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; - size_t channels = 1; - size_t padding = 13; + size_t w = defaultWidth(); + size_t h = defaultHeight(); for (auto [border_type, border_value] : get_borders()) { - TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels, - 
border_type, border_value, padding); + TestFixture::test_outside_random(w, h, w, h, 1, border_type, border_value, + 13); } } +TYPED_TEST(RemapS16Point5, OutsideRandomPadding4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = 11; + TestFixture::test_outside_random( + w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, padding); +} + TYPED_TEST(RemapS16Point5, BlendBigStride) { - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 16; + size_t src_w = defaultWidth(); + size_t src_h = defaultHeight(); size_t dst_w = src_w; size_t dst_h = src_h; size_t channels = 1; @@ -698,6 +722,16 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) { } } +TYPED_TEST(RemapS16Point5, BlendBigStride4chReplicate) { + size_t w = defaultWidth(); + size_t h = defaultHeight(); + size_t channels = 4; + size_t padding = + std::numeric_limits::max() / channels - w * channels; + TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, + nullptr, padding); +} + TYPED_TEST(RemapS16Point5, CornerCases) { size_t src_w = std::numeric_limits::max() + 1; size_t src_h = std::numeric_limits::max() + 1; @@ -711,6 +745,18 @@ TYPED_TEST(RemapS16Point5, CornerCases) { } } +TYPED_TEST(RemapS16Point5, CornerCases4ch) { + size_t src_w = 100; + size_t src_h = 8; + size_t dst_w = defaultWidth(); + size_t dst_h = defaultHeight(); + size_t channels = 4; + size_t padding = 17; + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, + padding); +} + TYPED_TEST(RemapS16Point5, NullPointer) { const TypeParam src[4] = {}; TypeParam dst[1]; @@ -819,17 +865,20 @@ TYPED_TEST(RemapS16Point5, UnsupportedBigStride) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) { +TYPED_TEST(RemapS16Point5, UnsupportedChannels) { const TypeParam src[1] = {}; TypeParam dst[8]; int16_t mapxy[16] = {}; uint16_t mapfrac[8] = {}; - EXPECT_EQ( - KLEIDICV_ERROR_NOT_IMPLEMENTED, - remap_s16point5()( - src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 2, - mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1, 1, 1, dst, 8, 8, 1, 2, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, 1, 1, 1, dst, 8, 8, 1, 3, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { @@ -844,6 +893,21 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) { 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src)); } +TYPED_TEST(RemapS16Point5, UnsupportedBigStride4ch) { + const TypeParam src[1] = {}; + TypeParam dst[8]; + int16_t mapxy[16] = {}; + uint16_t mapfrac[8] = {}; + + EXPECT_EQ( + KLEIDICV_ERROR_NOT_IMPLEMENTED, + remap_s16point5()( + src, + (std::numeric_limits::max() / 4 + 1L) * sizeof(TypeParam), + 1, 1, dst, 8, 8, 1, 4, mapxy, 4, mapfrac, 2, + KLEIDICV_BORDER_TYPE_CONSTANT, src)); +} + TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) { const TypeParam src[1] = {}; TypeParam dst[8]; -- GitLab From fe5bd51c8e7439dda6a62cd3b7dc51a20c767a5a Mon Sep 17 00:00:00 2001 From: Richard Wells Date: Tue, 18 Feb 2025 15:14:55 +0000 Subject: [PATCH 2/2] Refactor 4-channel RemapS16Point5 replicate and add constant. 
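
Pull the 4-channel coordinate, offset, load and interpolation steps out into
shared inline helpers, add 4-channel constant-border support alongside
replicate, and wire up CV_8UC4/CV_16UC4 in the OpenCV HAL adapter plus the
conformity and thread tests.

For reference, a scalar sketch of the per-channel arithmetic the kernels
implement, assuming REMAP16POINT5_FRAC_BITS is 5 (as the S16Point5 name
suggests), so REMAP16POINT5_FRAC_MAX is 32; the helper name and argument
order below are illustrative only:

    // a: top left, b: top right, c: bottom left, d: bottom right;
    // xfrac/yfrac are the low and next FRAC_BITS bits of the mapfrac entry.
    static inline uint16_t lerp2d_ref(uint16_t a, uint16_t b, uint16_t c,
                                      uint16_t d, uint32_t xfrac,
                                      uint32_t yfrac) {
      constexpr uint32_t kFracBits = 5;              // assumed, see above
      constexpr uint32_t kFracMax = 1u << kFracBits; // 32
      uint32_t line0 = xfrac * b + (kFracMax - xfrac) * a;  // top row (y0)
      uint32_t line1 = xfrac * d + (kFracMax - xfrac) * c;  // bottom row (y1)
      uint32_t bias = (kFracMax * kFracMax) / 2;     // rounds to nearest
      return static_cast<uint16_t>(
          (bias + (kFracMax - yfrac) * line0 + yfrac * line1) >>
          (2 * kFracBits));
    }
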
--- adapters/opencv/kleidicv_hal.cpp | 18 + conformity/opencv/test_remap.cpp | 4 + .../src/transform/remap_s16point5_neon.cpp | 916 ++++++++++++------ kleidicv/src/transform/remap_sc.h | 28 +- test/api/test_remap.cpp | 79 +- test/api/test_thread.cpp | 26 + 6 files changed, 731 insertions(+), 340 deletions(-) diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index e73c350fa..6c9dbdae7 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1363,6 +1363,15 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, static_cast(dst_width), static_cast(dst_height), 1, mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_8UC4) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_s16point5_u8( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 4, + mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, + border_value.data(), mt)); } else if (src_type == CV_16UC1) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_s16point5_u16( @@ -1372,6 +1381,15 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, static_cast(dst_width), static_cast(dst_height), 1, mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_16UC4) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_s16point5_u16( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 4, + mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, + border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index face699ec..c7bc4f57d 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -250,9 +250,13 @@ std::vector& remap_tests_get() { TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapF32 uint8 Replicate Linear", (test_remap_f32), (exec_remap_f32)), TEST("RemapF32 uint16 Replicate Linear", (test_remap_f32), (exec_remap_f32)), diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 8ffe4601a..568e13f9b 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -310,6 +310,43 @@ class RemapS16Point5Replicate { int16x8_t 
y1_; }; // end of class RemapS16Point5Replicate +inline uint8x8_t interpolate_u8(uint8x8_t v00_narrow, uint8x8_t v01_narrow, + uint8x8_t v10_narrow, uint8x8_t v11_narrow, + uint16x8_t xfrac, uint16x8_t yfrac, + uint16x8_t nxfrac, uint16x8_t nyfrac) { + uint16x8_t v00 = vmovl_u8(v00_narrow); + uint16x8_t v01 = vmovl_u8(v01_narrow); + uint16x8_t v10 = vmovl_u8(v10_narrow); + uint16x8_t v11 = vmovl_u8(v11_narrow); + + auto interpolate_horizontal = [&](uint16x8_t left, uint16x8_t right) { + return vmlaq_u16(vmulq_u16(nxfrac, left), xfrac, right); + }; + + // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint16x4_t top, uint16x4_t bottom, + uint16x4_t frac, uint16x4_t nfrac) { + uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, top, nfrac), bottom, frac); + return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint16x8_t line0 = interpolate_horizontal(v00, v10); + uint16x8_t line1 = interpolate_horizontal(v01, v11); + + uint16x4_t lo = + interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), + vget_low_u16(yfrac), vget_low_u16(nyfrac)); + uint16x4_t hi = + interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), + vget_high_u16(yfrac), vget_high_u16(nyfrac)); + + // Discard upper 8 bits of each element and combine low and high parts into + // a single register. + return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); +} + template class RemapS16Point5ConstantBorder; @@ -333,13 +370,13 @@ class RemapS16Point5ConstantBorder { auto vector_path = [&](size_t step) { int16x8x2_t xy = vld2q_s16(&mapxy[0]); uint16x8_t frac = vld1q_u16(&mapfrac[0]); - uint8x8_t frac_max = vdup_n_u8(REMAP16POINT5_FRAC_MAX); - uint8x8_t frac_mask = vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1); - uint8x8_t xfrac = vand_u8(vmovn_u16(frac), frac_mask); - uint8x8_t yfrac = - vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); - uint8x8_t nxfrac = vsub_u8(frac_max, xfrac); - uint8x8_t nyfrac = vsub_u8(frac_max, yfrac); + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); + uint16x8_t xfrac = vandq_u16(frac, frac_mask); + uint16x8_t yfrac = + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); + uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac); + uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac); uint16x8_t one = vdupq_n_u16(1); uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); @@ -356,8 +393,8 @@ class RemapS16Point5ConstantBorder { uint8x8_t v11 = load_pixels_or_constant_border( src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); - uint8x8_t result = interpolate(v00, v01, v10, v11, xfrac, vmovl_u8(yfrac), - nxfrac, vmovl_u8(nyfrac)); + uint8x8_t result = + interpolate_u8(v00, v01, v10, v11, xfrac, yfrac, nxfrac, nyfrac); vst1_u8(&dst[0], result); mapxy += ptrdiff_t(step); @@ -414,37 +451,6 @@ class RemapS16Point5ConstantBorder { return vbsl_u8(vmovn_u16(in_range), pixels, v_border_); } - uint8x8_t interpolate(uint8x8_t v00, uint8x8_t v01, uint8x8_t v10, - uint8x8_t v11, uint8x8_t xfrac, uint16x8_t yfrac, - uint8x8_t nxfrac, uint16x8_t nyfrac) { - auto interpolate_horizontal = [&](uint8x8_t left, uint8x8_t right) { - return vmlal_u8(vmull_u8(nxfrac, left), xfrac, right); - }; - - // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. 
- const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto interpolate_vertical = [&](uint16x4_t a, uint16x4_t b, uint16x4_t frac, - uint16x4_t nfrac) { - uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, a, nfrac), b, frac); - return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); - }; - - uint16x8_t line0 = interpolate_horizontal(v00, v10); - uint16x8_t line1 = interpolate_horizontal(v01, v11); - - uint16x4_t lo = - interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), - vget_low_u16(yfrac), vget_low_u16(nyfrac)); - uint16x4_t hi = - interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), - vget_high_u16(yfrac), vget_high_u16(nyfrac)); - - // Discard upper 8 bits of each element and combine low and high parts into - // a single register. - return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); - } - Rows src_rows_; uint16x8_t v_src_stride_; uint16x8_t v_width_; @@ -625,159 +631,554 @@ class RemapS16Point5ConstantBorder { int16x8_t y1_; }; // end of class RemapS16Point5ConstantBorder +inline void get_coordinates(Columns mapxy, + Columns mapfrac, uint16x8_t &x, + uint16x8_t &y, uint8x8_t &xfrac, uint8x8_t &yfrac) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + x = xy.val[0]; + y = xy.val[1]; + + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + xfrac = vand_u8(vmovn_u16(frac), vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1)); + yfrac = vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), + vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1)); +} + +inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, + uint16x4_t y1, uint32x4_t &offsets_a, + uint32x4_t &offsets_b, uint32x4_t &offsets_c, + uint32x4_t &offsets_d, + uint16x4_t v_src_element_stride) { + // Multiply by 4 because of channels + uint32x4_t x0_scaled = vshll_n_u16(x0, 2); + uint32x4_t x1_scaled = vshll_n_u16(x1, 2); + + // Calculate offsets from coordinates (y * element_stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride); + offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride); + offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride); + offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride); +} + +inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low, + uint8_t frac_high) { + uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low, + frac_high, frac_high, frac_high, frac_high}; + return vmovl_u8(frac_low_high); +} + +inline uint8x16_t interpolate_u8_4ch( + uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d, + uint8_t xfrac_pixel_0, uint8_t yfrac_pixel_0, uint8_t xfrac_pixel_1, + uint8_t yfrac_pixel_1, uint8_t xfrac_pixel_2, uint8_t yfrac_pixel_2, + uint8_t xfrac_pixel_3, uint8_t yfrac_pixel_3) { + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + + uint16x8_t xfrac_pixels_01 = + create_frac_low_high_u8_4ch(xfrac_pixel_0, xfrac_pixel_1); + uint16x8_t yfrac_pixels_01 = + create_frac_low_high_u8_4ch(yfrac_pixel_0, yfrac_pixel_1); + uint16x8_t nxfrac_pixels_01 = vsubq_u16(frac_max, xfrac_pixels_01); + uint16x8_t nyfrac_pixels_01 = vsubq_u16(frac_max, yfrac_pixels_01); + + uint16x8_t xfrac_pixels_23 = + create_frac_low_high_u8_4ch(xfrac_pixel_2, xfrac_pixel_3); + uint16x8_t yfrac_pixels_23 = + create_frac_low_high_u8_4ch(yfrac_pixel_2, yfrac_pixel_3); + uint16x8_t nxfrac_pixels_23 = vsubq_u16(frac_max, xfrac_pixels_23); + uint16x8_t nyfrac_pixels_23 = vsubq_u16(frac_max, yfrac_pixels_23); + + uint8x8x2_t res; + res.val[0] = interpolate_u8(vget_low_u8(a), vget_low_u8(c), 
vget_low_u8(b), + vget_low_u8(d), xfrac_pixels_01, yfrac_pixels_01, + nxfrac_pixels_01, nyfrac_pixels_01); + res.val[1] = interpolate_u8(vget_high_u8(a), vget_high_u8(c), vget_high_u8(b), + vget_high_u8(d), xfrac_pixels_23, yfrac_pixels_23, + nxfrac_pixels_23, nyfrac_pixels_23); + + return vcombine(res.val[0], res.val[1]); +} + +inline uint16x8_t interpolate_u16_4ch(uint16x8_t v00, uint16x8_t v01, + uint16x8_t v10, uint16x8_t v11, + uint16_t xfrac_pixel_0, + uint16_t yfrac_pixel_0, + uint16_t xfrac_pixel_1, + uint16_t yfrac_pixel_1) { + auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return vmlal_u16(vmull_u16(nfrac, left), frac, right); + }; + + auto interpolate_horizontal_low = [&](uint16x8_t left, uint16x8_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), frac, + nfrac); + }; + + auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), + frac, nfrac); + }; + + // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint32x4_t line0, uint32x4_t line1, + uint32x4_t frac, + uint32x4_t nfrac) -> uint32x4_t { + uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, line0, nfrac), line1, frac); + return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint16x4_t v_xfrac0 = vdup_n_u16(xfrac_pixel_0); + uint16x4_t v_nxfrac0 = vdup_n_u16(REMAP16POINT5_FRAC_MAX - xfrac_pixel_0); + uint16x4_t v_xfrac1 = vdup_n_u16(xfrac_pixel_1); + uint16x4_t v_nxfrac1 = vdup_n_u16(REMAP16POINT5_FRAC_MAX - xfrac_pixel_1); + + uint32x4_t line0_low = + interpolate_horizontal_low(v00, v10, v_xfrac0, v_nxfrac0); + uint32x4_t line1_low = + interpolate_horizontal_low(v01, v11, v_xfrac0, v_nxfrac0); + uint32x4_t line0_high = + interpolate_horizontal_high(v00, v10, v_xfrac1, v_nxfrac1); + uint32x4_t line1_high = + interpolate_horizontal_high(v01, v11, v_xfrac1, v_nxfrac1); + + uint32x4_t v_yfrac0 = vmovl_u16(vdup_n_u16(yfrac_pixel_0)); + uint32x4_t v_nyfrac0 = + vmovl_u16(vdup_n_u16(REMAP16POINT5_FRAC_MAX - yfrac_pixel_0)); + uint32x4_t v_yfrac1 = vmovl_u16(vdup_n_u16(yfrac_pixel_1)); + uint32x4_t v_nyfrac1 = + vmovl_u16(vdup_n_u16(REMAP16POINT5_FRAC_MAX - yfrac_pixel_1)); + + uint32x4_t lo = + interpolate_vertical(line0_low, line1_low, v_yfrac0, v_nyfrac0); + uint32x4_t hi = + interpolate_vertical(line0_high, line1_high, v_yfrac1, v_nyfrac1); + + // Discard upper 16 bits of each element (low the precision back to original + // 16 bits) + return vcombine(vmovn_u32(lo), vmovn_u32(hi)); +} + +inline uint64_t load_32bit(const uint8_t *src) { + uint32_t value = 0; + memcpy(&value, src, sizeof(uint32_t)); + return static_cast(value); +} + +inline uint8x16_t load_4px_4ch(Rows src_rows, + uint32x4_t offsets) { + uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32); + uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32); + return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23)); +} + +inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns dst) { + vst1q_u8_x2(&dst[0], res); +} + +inline uint16x8_t load_2px_4ch(Rows src_rows, + uint32x2_t 
offsets) { + return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]), + vld1_u16(&src_rows[vget_lane_u32(offsets, 1)])); +} + +inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns dst) { + vst1q_u16_x4(&dst[0], res); +} + +// Replicate border functions +inline void get_coordinates_replicate(Columns mapxy, + Columns mapfrac, + uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, + uint8x8_t &xfrac, uint8x8_t &yfrac, + int16x8_t v_xmax, int16x8_t v_ymax) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + // Zero the xfrac (or yfrac) if x (or y) are below zero + xfrac = + vbsl_u8(vmovn_u16(vcltq_s16(x0, vdupq_n_s16(0))), vdup_n_u8(0), xfrac); + yfrac = + vbsl_u8(vmovn_u16(vcltq_s16(y0, vdupq_n_s16(0))), vdup_n_u8(0), yfrac); + + // Clamp coordinates to within the dimensions of the source image + x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax))); + y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax))); + + // x1 = x0 + 1, except if it's already xmax + x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax)); + y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax)); +} + +inline void load_pixels_u8_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b, + uint8x16_t &c, uint8x16_t &d) { + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); +} + +inline void load_pixels_u16_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo, + uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo, + uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) { + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); +} + +// Constant border functions +inline void get_coordinates_constant( + Columns mapxy, Columns mapfrac, + uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, uint8x8_t &xfrac, uint8x8_t &yfrac, + uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c, + uint16x8_t &in_range_d) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + uint16x8_t one = vdupq_n_u16(1); + x1 = vaddq_u16(x0, one); + y1 = vaddq_u16(y0, one); + + uint16x8_t x0_in_range = vcltq_u16(x0, v_width); + uint16x8_t y0_in_range = vcltq_u16(y0, v_height); + uint16x8_t x1_in_range = vcltq_u16(x1, v_width); + uint16x8_t y1_in_range = vcltq_u16(y1, v_height); + + in_range_a = vandq(x0_in_range, y0_in_range); + in_range_b = vandq(x1_in_range, y0_in_range); + in_range_c = vandq(x0_in_range, y1_in_range); + in_range_d = vandq(x1_in_range, y1_in_range); +} + +inline uint32x4_t zero_out_of_range_offsets(uint16x4_t in_range, + uint32x4_t offsets) { + return vbslq_u32(vmovl_u16(in_range), offsets, vdupq_n_u32(0)); +} + +inline uint8x16_t replace_pixel_with_border_u8_4ch(uint16x4_t in_range, + uint8x16_t pixels, + uint8x16_t v_border) { + uint8x16_t in_range_stretched = + 
vcombine(vmovn_u16(vcombine(vdup_n_u16(vget_lane_u16(in_range, 0)), + vdup_n_u16(vget_lane_u16(in_range, 1)))), + vmovn_u16(vcombine(vdup_n_u16(vget_lane_u16(in_range, 2)), + vdup_n_u16(vget_lane_u16(in_range, 3))))); + + return vbslq_u8(in_range_stretched, pixels, v_border); +} + +inline uint16x8_t replace_pixel_with_border_u16_4ch(uint16_t in_range0, + uint16_t in_range1, + uint16x8_t pixels, + uint16x8_t v_border) { + uint16x8_t in_range_stretched = + vcombine(vdup_n_u16(in_range0), vdup_n_u16(in_range1)); + return vbslq_u16(in_range_stretched, pixels, v_border); +} + +inline void load_pixels_u8_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x4_t in_range_a, + uint16x4_t in_range_b, uint16x4_t in_range_c, uint16x4_t in_range_d, + uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c, + uint8x16_t &d) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); + + a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border); + b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border); + c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border); + d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border); +} + +inline void load_pixels_u16_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x4_t in_range_a, + uint16x4_t in_range_b, uint16x4_t in_range_c, uint16x4_t in_range_d, + uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo, + uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo, + uint16x8_t &d_hi) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + a_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_a, 0), + vget_lane_u16(in_range_a, 1), a_lo, + v_border); + b_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_b, 0), + vget_lane_u16(in_range_b, 1), b_lo, + v_border); + c_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_c, 0), + vget_lane_u16(in_range_c, 1), c_lo, + v_border); + d_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_d, 0), + vget_lane_u16(in_range_d, 1), d_lo, + v_border); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); + + a_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_a, 2), + vget_lane_u16(in_range_a, 3), a_hi, + v_border); + b_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_b, 2), + vget_lane_u16(in_range_b, 3), b_hi, + v_border); + c_hi = 
replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_c, 2), + vget_lane_u16(in_range_c, 3), c_hi, + v_border); + d_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_d, 2), + vget_lane_u16(in_range_d, 3), d_hi, + v_border); +} + template -class RemapS16Point5ReplicateFourChannels; +class RemapS16Point5Replicate4ch; template <> -class RemapS16Point5ReplicateFourChannels { +class RemapS16Point5Replicate4ch { public: using ScalarType = uint8_t; using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; - using MapVector2Type = typename MapVecTraits::Vector2Type; - using FracVecTraits = neon::VecTraits; - using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5ReplicateFourChannels(Rows src_rows, - size_t src_width, size_t src_height) + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) : src_rows_{src_rows}, v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} - void get_map_coordinates(Columns mapxy, - Columns mapfrac, uint16x8_t &x0, - uint16x8_t &y0, uint16x8_t &x1, uint16x8_t &y1, - uint16x8_t &xfrac, uint16x8_t &yfrac) { - MapVector2Type xy = vld2q_s16(&mapxy[0]); - FracVectorType frac = vld1q_u16(&mapfrac[0]); - xfrac = vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), - vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); - yfrac = vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), - vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), - vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); - // Clamp coordinates to within the dimensions of the source image - x0 = vreinterpretq_u16_s16( - vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); - y0 = vreinterpretq_u16_s16( - vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + res.val[0] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1), + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + res.val[1] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5), + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u8_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; - // x1 = x0 + 1, except if it's already xmax - x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); - y1 = vsubq_u16(y0, 
vcltq_s16(xy.val[1], v_ymax_)); + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); } - void get_offsets(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, uint16x4_t y1, - uint32x4_t &offsets_a, uint32x4_t &offsets_b, - uint32x4_t &offsets_c, uint32x4_t &offsets_d) { - // Multiply by 4 because of channels - uint32x4_t x0_scaled = vshll_n_u16(x0, 2); - uint32x4_t x1_scaled = vshll_n_u16(x1, 2); + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - offsets_a = vmlal_u16(x0_scaled, y0, v_src_stride_); - offsets_b = vmlal_u16(x1_scaled, y0, v_src_stride_); - offsets_c = vmlal_u16(x0_scaled, y1, v_src_stride_); - offsets_d = vmlal_u16(x1_scaled, y1, v_src_stride_); - }; +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; - uint16x4_t interpolate(uint16x4_t a, uint16x4_t b, uint16x4_t c, uint16x4_t d, - uint16_t xfrac, uint16_t yfrac) { - uint16x4_t line0 = - vmla_n_u16(vmul_n_u16(b, xfrac), a, REMAP16POINT5_FRAC_MAX - xfrac); - uint32x4_t line0_lerpd = - vmlal_n_u16(vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, - REMAP16POINT5_FRAC_MAX - yfrac); - uint16x4_t line1 = - vmla_n_u16(vmul_n_u16(d, xfrac), c, REMAP16POINT5_FRAC_MAX - xfrac); - return vshrn_n_u32(vmlal_n_u16(line0_lerpd, line1, yfrac), - 2 * REMAP16POINT5_FRAC_BITS); - }; + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} - uint64_t load_32bit(const ScalarType *src) { - uint32_t value; - memcpy(&value, src, sizeof(uint32_t)); - return static_cast(value); - }; + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); - uint16x8_t load_64bit(uint32_t offset_low, uint32_t offset_high) { - uint64_t acc = load_32bit(&src_rows_[offset_low]) | - (load_32bit(&src_rows_[offset_high]) << 32); - return vmovl_u8(vset_lane_u64(acc, vdup_n_u64(0), 0)); - }; + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; + uint16x8x4_t res; + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + res.val[0] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1)); + res.val[1] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); 
+ + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + res.val[2] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5)); + res.val[3] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u16_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; - uint16x8_t load_01(uint32x4_t offsets) { - return load_64bit(vgetq_lane_u32(offsets, 0), vgetq_lane_u32(offsets, 1)); - }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } - uint16x8_t load_23(uint32x4_t offsets) { - return load_64bit(vgetq_lane_u32(offsets, 2), vgetq_lane_u32(offsets, 3)); - }; + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch - uint8x16_t load_and_interpolate(uint32x4_t offsets_a, uint32x4_t offsets_b, - uint32x4_t offsets_c, uint32x4_t offsets_d, - uint16x4_t xfrac, uint16x4_t yfrac) { - uint16x8_t a = load_01(offsets_a); - uint16x8_t b = load_01(offsets_b); - uint16x8_t c = load_01(offsets_c); - uint16x8_t d = load_01(offsets_d); - - uint16x4x4_t res; - res.val[0] = interpolate(vget_low_u16(a), vget_low_u16(b), vget_low_u16(c), - vget_low_u16(d), vget_lane_u16(xfrac, 0), - vget_lane_u16(yfrac, 0)); - res.val[1] = interpolate(vget_high_u16(a), vget_high_u16(b), - vget_high_u16(c), vget_high_u16(d), - vget_lane_u16(xfrac, 1), vget_lane_u16(yfrac, 1)); - - a = load_23(offsets_a); - b = load_23(offsets_b); - c = load_23(offsets_c); - d = load_23(offsets_d); - - res.val[2] = interpolate(vget_low_u16(a), vget_low_u16(b), vget_low_u16(c), - vget_low_u16(d), vget_lane_u16(xfrac, 2), - vget_lane_u16(yfrac, 2)); - res.val[3] = interpolate(vget_high_u16(a), vget_high_u16(b), - vget_high_u16(c), vget_high_u16(d), - vget_lane_u16(xfrac, 3), vget_lane_u16(yfrac, 3)); - - return vuzp1q_u8(vcombine(res.val[0], res.val[1]), - vcombine(res.val[2], res.val[3])); - } +template +class RemapS16Point5Constant4ch; - void store_pixels(uint8x16_t res_low, uint8x16_t res_high, - Columns dst) { - uint8x16x2_t res; - res.val[0] = res_low; - res.val[1] = res_high; - vst1q_u8_x2(&dst[0], res); - } +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u8(*border_value)} {} void process_row(size_t width, Columns mapxy, Columns mapfrac, Columns dst) { auto vector_path = [&](size_t step) { - uint16x8_t x0, y0, x1, y1, xfrac, yfrac; - get_map_coordinates(mapxy, mapfrac, x0, y0, x1, y1, 
xfrac, yfrac); + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; - - get_offsets(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), - vget_low_u16(y1), offsets_a, offsets_b, offsets_c, offsets_d); - uint8x16_t res_low = - load_and_interpolate(offsets_a, offsets_b, offsets_c, offsets_d, - vget_low_u16(xfrac), vget_low_u16(yfrac)); - - get_offsets(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), - vget_high_u16(y1), offsets_a, offsets_b, offsets_c, - offsets_d); - uint8x16_t res_high = - load_and_interpolate(offsets_a, offsets_b, offsets_c, offsets_d, - vget_high_u16(xfrac), vget_high_u16(yfrac)); - - store_pixels(res_low, res_high, dst); + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_low_u16(in_range_a), vget_low_u16(in_range_b), + vget_low_u16(in_range_c), vget_low_u16(in_range_d), v_border_, a, b, + c, d); + res.val[0] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1), + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_high_u16(in_range_a), vget_high_u16(in_range_b), + vget_high_u16(in_range_c), vget_high_u16(in_range_d), v_border_, a, b, + c, d); + res.val[1] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5), + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u8_4ch(res, dst); mapxy += ptrdiff_t(step); mapfrac += ptrdiff_t(step); dst += ptrdiff_t(step); @@ -796,144 +1197,80 @@ class RemapS16Point5ReplicateFourChannels { private: Rows src_rows_; uint16x4_t v_src_stride_; - int16x8_t v_xmax_; - int16x8_t v_ymax_; -}; // end of class RemapS16Point5ReplicateFourChannels + uint16x8_t v_width_; + uint16x8_t v_height_; + uint8x16_t v_border_; +}; // end of class RemapS16Point5Constant4ch -// TODO: Refactor this to match the uint8_t layout template <> -class RemapS16Point5ReplicateFourChannels { +class RemapS16Point5Constant4ch { public: using ScalarType = uint16_t; using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; - using MapVector2Type = typename MapVecTraits::Vector2Type; - using FracVecTraits = neon::VecTraits; - using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5ReplicateFourChannels(Rows src_rows, - size_t src_width, size_t src_height) + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) : src_rows_{src_rows}, v_src_element_stride_{vdup_n_u16( static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + 
v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u16(*border_value)} {} void process_row(size_t width, Columns mapxy, Columns mapfrac, Columns dst) { auto vector_path = [&](size_t step) { - MapVector2Type xy = vld2q_s16(&mapxy[0]); - FracVectorType frac = vld1q_u16(&mapfrac[0]); - uint16x8_t xfrac = - vbslq_u16(vcltq_s16(xy.val[0], vdupq_n_s16(0)), vdupq_n_u16(0), - vandq_u16(frac, vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); - uint16x8_t yfrac = - vbslq_u16(vcltq_s16(xy.val[1], vdupq_n_s16(0)), vdupq_n_u16(0), - vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), - vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1))); - - // Clamp coordinates to within the dimensions of the source image - uint16x8_t x0 = vreinterpretq_u16_s16( - vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); - uint16x8_t y0 = vreinterpretq_u16_s16( - vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); - - // x1 = x0 + 1, except if it's already xmax - uint16x8_t x1 = vsubq_u16(x0, vcltq_s16(xy.val[0], v_xmax_)); - uint16x8_t y1 = vsubq_u16(y0, vcltq_s16(xy.val[1], v_ymax_)); + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); - auto load_16x4 = [&](uint32_t offset) { - return vld1_u64(reinterpret_cast(&src_rows_[offset])); - }; - - uint16_t xfrac_array[8], yfrac_array[8]; - vst1q_u16(xfrac_array, xfrac); - vst1q_u16(yfrac_array, yfrac); - - auto interpolate = [xfrac_array, yfrac_array](uint16x4_t a, uint16x4_t b, - uint16x4_t c, uint16x4_t d, - size_t index) { - uint32x4_t line0 = - vmlal_n_u16(vmull_n_u16(b, xfrac_array[index]), a, - REMAP16POINT5_FRAC_MAX - xfrac_array[index]); - uint32x4_t line0_lerpd = vmlaq_n_u32( - vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2), line0, - static_cast(REMAP16POINT5_FRAC_MAX - yfrac_array[index])); - uint32x4_t line1 = - vmlal_n_u16(vmull_n_u16(d, xfrac_array[index]), c, - REMAP16POINT5_FRAC_MAX - xfrac_array[index]); - return vshrn_n_u32( - vmlaq_n_u32(line0_lerpd, line1, - static_cast(yfrac_array[index])), - 2 * REMAP16POINT5_FRAC_BITS); - }; - -#define LOAD_AND_INTERPOLATE(a, b, c, d, index1, index2) \ - interpolate(load_16x4(vgetq_lane_u32((a), (index1))), \ - load_16x4(vgetq_lane_u32((b), (index1))), \ - load_16x4(vgetq_lane_u32((c), (index1))), \ - load_16x4(vgetq_lane_u32((d), (index1))), (index2)) - - // Calculate offsets from coordinates (y * element_stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; uint16x8x4_t res; - { - // multiply by 4 because of channels - uint32x4_t x0_low = vshll_n_u16(vget_low_u16(x0), 2); - uint32x4_t x1_low = vshll_n_u16(vget_low_u16(x1), 2); - uint32x4_t offsets_a = - vmlal_u16(x0_low, vget_low_u16(y0), v_src_element_stride_); - uint32x4_t offsets_b = - vmlal_u16(x1_low, vget_low_u16(y0), v_src_element_stride_); - uint32x4_t offsets_c = - vmlal_u16(x0_low, vget_low_u16(y1), v_src_element_stride_); - uint32x4_t offsets_d = - vmlal_u16(x1_low, vget_low_u16(y1), v_src_element_stride_); - - uint16x4_t res0 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 0, 0); - uint16x4_t res1 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 1, 1); - uint16x4_t res2 = 
LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 2, 2); - uint16x4_t res3 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 3, 3); - - res.val[0] = vcombine(res0, res1); - res.val[1] = vcombine(res2, res3); - } - - { - // multiply by 4 because of channels - uint32x4_t x0_high = vshll_high_n_u16(x0, 2); - uint32x4_t x1_high = vshll_high_n_u16(x1, 2); - uint32x4_t offsets_a = - vmlal_u16(x0_high, vget_high_u16(y0), v_src_element_stride_); - uint32x4_t offsets_b = - vmlal_u16(x1_high, vget_high_u16(y0), v_src_element_stride_); - uint32x4_t offsets_c = - vmlal_u16(x0_high, vget_high_u16(y1), v_src_element_stride_); - uint32x4_t offsets_d = - vmlal_u16(x1_high, vget_high_u16(y1), v_src_element_stride_); - - uint16x4_t res0 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 0, 4); - uint16x4_t res1 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 1, 5); - uint16x4_t res2 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 2, 6); - uint16x4_t res3 = LOAD_AND_INTERPOLATE(offsets_a, offsets_b, offsets_c, - offsets_d, 3, 7); - - res.val[2] = vcombine(res0, res1); - res.val[3] = vcombine(res2, res3); - } - - vst1q_u16_x4(&dst[0], res); + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_low_u16(in_range_a), vget_low_u16(in_range_b), + vget_low_u16(in_range_c), vget_low_u16(in_range_d), v_border_, a_low, + a_high, b_low, b_high, c_low, c_high, d_low, d_high); + res.val[0] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1)); + res.val[1] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_high_u16(in_range_a), vget_high_u16(in_range_b), + vget_high_u16(in_range_c), vget_high_u16(in_range_d), v_border_, + a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high); + res.val[2] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5)); + res.val[3] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u16_4ch(res, dst); mapxy += ptrdiff_t(step); mapfrac += ptrdiff_t(step); dst += ptrdiff_t(step); }; + LoopUnroll loop{width, MapVecTraits::num_lanes()}; loop.unroll_once(vector_path); ptrdiff_t back_step = static_cast(loop.step()) - @@ -947,9 +1284,10 @@ class RemapS16Point5ReplicateFourChannels { private: Rows src_rows_; uint16x4_t v_src_element_stride_; - int16x8_t v_xmax_; - int16x8_t v_ymax_; -}; // end of class RemapS16Point5ReplicateFourChannels + uint16x8_t v_width_; + uint16x8_t v_height_; + uint16x8_t v_border_; +}; // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) @@ -988,8 +1326,9 @@ kleidicv_error_t remap_s16point5( zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } else { assert(channels == 4); - // TODO - return KLEIDICV_ERROR_NOT_IMPLEMENTED; + RemapS16Point5Constant4ch operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); @@ -998,8 +1337,7 @@ kleidicv_error_t remap_s16point5( zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } else { assert(channels == 4); - RemapS16Point5ReplicateFourChannels operation{src_rows, src_width, - src_height}; + RemapS16Point5Replicate4ch operation{src_rows, src_width, src_height}; zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } } diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index 8bf246474..269757fec 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -581,10 +581,10 @@ class RemapS16Point5Replicate { }; // end of class RemapS16Point5Replicate template -class RemapS16Point5ReplicateFourChannels; +class RemapS16Point5Replicate4ch; template <> -class RemapS16Point5ReplicateFourChannels { +class RemapS16Point5Replicate4ch { public: using ScalarType = uint8_t; using MapVecTraits = VecTraits; @@ -593,11 +593,9 @@ class RemapS16Point5ReplicateFourChannels { using FracVecTraits = VecTraits; using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5ReplicateFourChannels(Rows src_rows, - size_t src_width, size_t src_height, - svuint16_t& v_src_stride, - MapVectorType& v_x_max, - MapVectorType& v_y_max) + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) : src_rows_{src_rows}, v_src_stride_{v_src_stride}, v_xmax_{v_x_max}, @@ -751,10 +749,10 @@ class RemapS16Point5ReplicateFourChannels { svuint16_t& v_src_stride_; MapVectorType& v_xmax_; MapVectorType& v_ymax_; -}; // end of class RemapS16Point5ReplicateFourChannels +}; // end of class RemapS16Point5Replicate4ch template <> -class RemapS16Point5ReplicateFourChannels { +class RemapS16Point5Replicate4ch { public: using ScalarType = uint16_t; using MapVecTraits = VecTraits; @@ -763,11 +761,9 @@ class RemapS16Point5ReplicateFourChannels { using FracVecTraits = VecTraits; using FracVectorType = typename FracVecTraits::VectorType; - RemapS16Point5ReplicateFourChannels(Rows src_rows, - size_t src_width, size_t src_height, - svuint16_t& v_src_stride, - MapVectorType& v_x_max, - MapVectorType& v_y_max) + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) : src_rows_{src_rows}, v_src_element_stride_{v_src_stride}, v_xmax_{v_x_max}, @@ -1005,7 +1001,7 @@ class RemapS16Point5ReplicateFourChannels { svuint16_t& v_src_element_stride_; MapVectorType& v_xmax_; MapVectorType& v_ymax_; -}; // end of class RemapS16Point5ReplicateFourChannels +}; // end of class RemapS16Point5Replicate4ch template class RemapS16Point5ConstantBorder; @@ -1254,7 +1250,7 @@ kleidicv_error_t remap_s16point5_sc( zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); } else { assert(channels == 4); - RemapS16Point5ReplicateFourChannels operation{ + RemapS16Point5Replicate4ch operation{ src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; zip_rows(operation, 
rect, mapxy_rows, mapfrac_rows, dst_rows); } diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 1e8d89b31..a7bcc8222 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -35,7 +35,8 @@ KLEIDICV_REMAP_F32(uint16_t, u16); template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, - kleidicv_border_type_t border_type, const ScalarType *border_value) { + ptrdiff_t ch, kleidicv_border_type_t border_type, + const ScalarType *border_value) { // Width is the number of pixels in a row, but Array2D does not handle that const ptrdiff_t src_width = static_cast(src.width() / src.channels()); @@ -50,7 +51,7 @@ static const ScalarType *get_array2d_element_or_border( return border_value; } } - return src.at(y, x * src.channels()); + return src.at(y, x * src.channels() + ch); } template @@ -191,9 +192,9 @@ class RemapS16 : public testing::Test { kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -202,7 +203,7 @@ class RemapS16 : public testing::Test { for (size_t ch = 0; ch < src.channels(); ++ch) { const int16_t *coords = mapxy.at(row, column * 2); int16_t x = coords[0], y = coords[1]; - *expected.at(row, column * src.channels() + ch) = get_src(x, y)[ch]; + *expected.at(row, column * src.channels() + ch) = *get_src(x, y, ch); } } } @@ -618,9 +619,9 @@ class RemapS16Point5 : public testing::Test { kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -638,8 +639,8 @@ class RemapS16Point5 : public testing::Test { ptrdiff_t x = coords[0], y = coords[1]; for (size_t ch = 0; ch < src.channels(); ++ch) { *expected.at(row, column * src.channels() + ch) = - lerp2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], - get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); + lerp2d(x_frac, y_frac, *get_src(x, y, ch), *get_src(x + 1, y, ch), + *get_src(x, y + 1, ch), *get_src(x + 1, y + 1, ch)); } } } @@ -664,14 +665,15 @@ TYPED_TEST(RemapS16Point5, RandomNoPadding) { } } -// TODO: Modify tests to also run constant border once implemented -TYPED_TEST(RemapS16Point5, RandomNoPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, RandomNoPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = 0; - TestFixture::test_random(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_random(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, BlendPadding) { @@ -682,13 +684,15 @@ TYPED_TEST(RemapS16Point5, BlendPadding) { } } -TYPED_TEST(RemapS16Point5, BlendPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, BlendPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t 
channels = 4; size_t padding = 7; - TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { @@ -700,13 +704,15 @@ TYPED_TEST(RemapS16Point5, OutsideRandomPadding) { } } -TYPED_TEST(RemapS16Point5, OutsideRandomPadding4chReplicate) { +TYPED_TEST(RemapS16Point5, OutsideRandomPadding4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = 11; - TestFixture::test_outside_random( - w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_outside_random(w, h, w, h, channels, border_type, + border_value, padding); + } } TYPED_TEST(RemapS16Point5, BlendBigStride) { @@ -722,14 +728,16 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) { } } -TYPED_TEST(RemapS16Point5, BlendBigStride4chReplicate) { +TYPED_TEST(RemapS16Point5, BlendBigStride4ch) { size_t w = defaultWidth(); size_t h = defaultHeight(); size_t channels = 4; size_t padding = std::numeric_limits::max() / channels - w * channels; - TestFixture::test_blend(w, h, w, h, channels, KLEIDICV_BORDER_TYPE_REPLICATE, - nullptr, padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_blend(w, h, w, h, channels, border_type, border_value, + padding); + } } TYPED_TEST(RemapS16Point5, CornerCases) { @@ -752,9 +760,10 @@ TYPED_TEST(RemapS16Point5, CornerCases4ch) { size_t dst_h = defaultHeight(); size_t channels = 4; size_t padding = 17; - TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr, - padding); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels, + border_type, border_value, padding); + } } TYPED_TEST(RemapS16Point5, NullPointer) { @@ -1145,9 +1154,9 @@ class RemapF32 : public testing::Test { const ScalarType *border_value, kleidicv_interpolation_type_t interpolation, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -1168,10 +1177,10 @@ class RemapF32 : public testing::Test { float xfrac = x - std::floor(x); float yfrac = y - std::floor(y); for (size_t ch = 0; ch < src.channels(); ++ch) { - float a = get_src(ix, iy)[ch]; - float b = get_src(ix + 1, iy)[ch]; - float c = get_src(ix, iy + 1)[ch]; - float d = get_src(ix + 1, iy + 1)[ch]; + float a = *get_src(ix, iy, ch); + float b = *get_src(ix + 1, iy, ch); + float c = *get_src(ix, iy + 1, ch); + float d = *get_src(ix + 1, iy + 1, ch); float line1 = (b - a) * xfrac + a; float line2 = (d - c) * xfrac + c; float float_result = (line2 - line1) * yfrac + line1; @@ -1190,7 +1199,7 @@ class RemapF32 : public testing::Test { static_cast(KLEIDICV_MAX_IMAGE_PIXELS)))); for (size_t ch = 0; ch < src.channels(); ++ch) { *expected.at(row, column * src.channels() + ch) = - get_src(ix, iy)[ch]; + *get_src(ix, iy, ch); } } } diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 128cf7af7..e7214531d 100644 --- a/test/api/test_thread.cpp +++ 
b/test/api/test_thread.cpp @@ -722,12 +722,24 @@ TEST_P(Thread, remap_s16point5_u8_border_replicate) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16point5_u8_border_replicate_4ch) { + check_remap_s16point5(kleidicv_remap_s16point5_u8, + kleidicv_thread_remap_s16point5_u8, 4, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + TEST_P(Thread, remap_s16point5_u16_border_replicate) { check_remap_s16point5(kleidicv_remap_s16point5_u16, kleidicv_thread_remap_s16point5_u16, 1, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } +TEST_P(Thread, remap_s16point5_u16_border_replicate_4ch) { + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 4, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); +} + TEST_P(Thread, remap_s16point5_u8_border_constant) { const uint8_t border_value = 0; check_remap_s16point5(kleidicv_remap_s16point5_u8, @@ -735,6 +747,13 @@ TEST_P(Thread, remap_s16point5_u8_border_constant) { KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } +TEST_P(Thread, remap_s16point5_u8_border_constant_4ch) { + const uint8_t border_value = 0; + check_remap_s16point5(kleidicv_remap_s16point5_u8, + kleidicv_thread_remap_s16point5_u8, 4, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + TEST_P(Thread, remap_s16point5_u16_border_constant) { const uint16_t border_value = 0; check_remap_s16point5(kleidicv_remap_s16point5_u16, @@ -742,6 +761,13 @@ TEST_P(Thread, remap_s16point5_u16_border_constant) { KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); } +TEST_P(Thread, remap_s16point5_u16_border_constant_4ch) { + const uint16_t border_value = 0; + check_remap_s16point5(kleidicv_remap_s16point5_u16, + kleidicv_thread_remap_s16point5_u16, 4, + KLEIDICV_BORDER_TYPE_CONSTANT, &border_value); +} + TEST_P(Thread, remap_s16point5_u8_not_implemented) { const uint8_t border_value = 0; check_remap_s16point5_not_implemented( -- GitLab
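
A minimal scalar sketch of the per-pixel work the new 4-channel kernels vectorize, for orientation only: the offset of a 4-channel u8 pixel is y * stride + 4 * x, border replication clamps the four neighbour coordinates and zeroes the fractional weights when the base coordinate is negative, and the blend applies fixed-point weights with a +0.5 rounding bias before shifting the fraction bits away. The 5-bit fraction width and every name below are assumptions inferred from the REMAP16POINT5_* identifiers, not part of the library.

// Illustrative scalar reference; constants and helpers are assumed, not from the patch.
#include <algorithm>
#include <cstddef>
#include <cstdint>

constexpr unsigned kFracBits = 5;                          // assumed REMAP16POINT5_FRAC_BITS
constexpr unsigned kFracMax = 1u << kFracBits;             // assumed REMAP16POINT5_FRAC_MAX
constexpr uint32_t kRoundBias = (kFracMax * kFracMax) / 2; // +0.5 in the doubled fraction

// Bilinear blend of one channel; xfrac and yfrac are expected in [0, kFracMax).
inline uint8_t bilerp_fixed_u8(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                               uint32_t xfrac, uint32_t yfrac) {
  uint32_t line0 = a * (kFracMax - xfrac) + b * xfrac;  // top row (a: left, b: right)
  uint32_t line1 = c * (kFracMax - xfrac) + d * xfrac;  // bottom row (c: left, d: right)
  return static_cast<uint8_t>(
      (kRoundBias + line0 * (kFracMax - yfrac) + line1 * yfrac) >> (2 * kFracBits));
}

// One destination pixel of a 4-channel u8 image with replicated borders.
inline void remap_pixel_4ch_replicate(const uint8_t *src, size_t stride,
                                      size_t width, size_t height, int x, int y,
                                      uint32_t xfrac, uint32_t yfrac, uint8_t *dst) {
  if (x < 0) xfrac = 0;  // as in the kernels: drop the fraction for negative coordinates
  if (y < 0) yfrac = 0;
  int x0 = std::clamp(x, 0, static_cast<int>(width) - 1);
  int y0 = std::clamp(y, 0, static_cast<int>(height) - 1);
  int x1 = std::min(x0 + 1, static_cast<int>(width) - 1);   // x0 + 1 unless at the edge
  int y1 = std::min(y0 + 1, static_cast<int>(height) - 1);
  const uint8_t *a = src + size_t(y0) * stride + 4 * size_t(x0);  // top left
  const uint8_t *b = src + size_t(y0) * stride + 4 * size_t(x1);  // top right
  const uint8_t *c = src + size_t(y1) * stride + 4 * size_t(x0);  // bottom left
  const uint8_t *d = src + size_t(y1) * stride + 4 * size_t(x1);  // bottom right
  for (int ch = 0; ch < 4; ++ch) {
    dst[ch] = bilerp_fixed_u8(a[ch], b[ch], c[ch], d[ch], xfrac, yfrac);
  }
}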
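
Continuing the same sketch, the constant-border variants handle out-of-range neighbours differently: the offset of any neighbour falling outside the image is forced to 0 so the load stays inside the buffer, and the loaded value is then replaced by the single splatted border value, mirroring load_pixels_u8_4ch_constant. The helper name is again hypothetical.

// One neighbour fetch for the constant-border path; reuses the constants above.
inline void fetch_4ch_or_border(const uint8_t *src, size_t stride,
                                size_t width, size_t height, int x, int y,
                                uint8_t border_value, uint8_t out[4]) {
  // Unsigned compares treat negative coordinates as huge, as in the NEON range test.
  bool in_range = static_cast<size_t>(x) < width && static_cast<size_t>(y) < height;
  // Out-of-range offsets are zeroed so the (discarded) load cannot fault.
  size_t offset = in_range ? size_t(y) * stride + 4 * size_t(x) : 0;
  for (int ch = 0; ch < 4; ++ch) {
    out[ch] = in_range ? src[offset + ch] : border_value;
  }
}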