From 1629fdf91e7a7a971abd8db8b03376ff84846719 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Tue, 25 Feb 2025 15:53:05 +0000 Subject: [PATCH] Simplify, beautify and unify common parts of Remap_F32 and WarpPerspective Zero-size (width or height) source images are not supported anymore. Division by zero is now not replaced with 0, but is left NaN. --- kleidicv/include/kleidicv/kleidicv.h | 3 +- kleidicv/include/kleidicv/transform/remap.h | 14 - kleidicv/src/transform/remap_f32_neon.cpp | 1236 ++--------------- kleidicv/src/transform/remap_f32_sve2.cpp | 321 +++++ kleidicv/src/transform/remap_s16_sve2.cpp | 277 ++++ .../{remap_sc.h => remap_s16point5_sve2.cpp} | 568 +------- kleidicv/src/transform/remap_sve2.cpp | 84 -- kleidicv/src/transform/transform_common.h | 54 + kleidicv/src/transform/transform_neon.h | 256 ++++ .../{common_sc.h => transform_sve2.h} | 126 +- .../src/transform/warp_perspective_neon.cpp | 386 +---- kleidicv/src/transform/warp_perspective_sc.h | 299 ---- .../src/transform/warp_perspective_sve2.cpp | 270 +++- test/api/test_remap.cpp | 14 +- test/api/test_warp_perspective.cpp | 40 +- 15 files changed, 1426 insertions(+), 2522 deletions(-) create mode 100644 kleidicv/src/transform/remap_f32_sve2.cpp create mode 100644 kleidicv/src/transform/remap_s16_sve2.cpp rename kleidicv/src/transform/{remap_sc.h => remap_s16point5_sve2.cpp} (50%) delete mode 100644 kleidicv/src/transform/remap_sve2.cpp create mode 100644 kleidicv/src/transform/transform_common.h create mode 100644 kleidicv/src/transform/transform_neon.h rename kleidicv/src/transform/{common_sc.h => transform_sve2.h} (60%) delete mode 100644 kleidicv/src/transform/warp_perspective_sc.h diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 294414163..2bed6acfd 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1840,7 +1840,8 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u16, const uint16_t *src, /// Width and height are the same for `mapx`, `mapy` and for `dst`. `src` /// dimensions may be different, but due to the limits of 32-bit float format, /// its width and height must be less than 2^24. Coordinates outside of `src` -/// dimensions are considered border. +/// dimensions are considered border. Zero width or height `src` is not +/// supported. /// /// @param src Pointer to the source data. Must be non-null. 
/// @param src_stride Distance in bytes from the start of one row to the diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index dcf43bd64..15b1654b6 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -142,20 +142,6 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, const T *border_value); } // namespace sve2 -namespace sme2 { - -template -kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, - size_t src_width, size_t src_height, T *dst, - size_t dst_stride, size_t dst_width, - size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - const uint16_t *mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, - const T *border_value); - -} // namespace sme2 - } // namespace kleidicv #endif // KLEIDICV_REMAP_REMAP_H diff --git a/kleidicv/src/transform/remap_f32_neon.cpp b/kleidicv/src/transform/remap_f32_neon.cpp index ccfec490c..18f517c14 100644 --- a/kleidicv/src/transform/remap_f32_neon.cpp +++ b/kleidicv/src/transform/remap_f32_neon.cpp @@ -2,1094 +2,151 @@ // // SPDX-License-Identifier: Apache-2.0 +#include + #include -#include -#include "kleidicv/kleidicv.h" +#include "kleidicv/ctypes.h" #include "kleidicv/neon.h" #include "kleidicv/transform/remap.h" +#include "transform_neon.h" namespace kleidicv::neon { template -class RemapF32Replicate; - -template -class RemapF32Replicate { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32Replicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_stride_{vdup_n_u32(static_cast(src_rows_.stride()))}, - vq_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride_); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) - << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) - << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const 
float *ptr_mapy) { - MapVectorType x = vld1q_f32(ptr_mapx); - MapVectorType y = vld1q_f32(ptr_mapy); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load(x0, y0); - float32x4_t b = load(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load(x0, y1); - float32x4_t d = load(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); - }; - - auto vector_path_4 = [&](size_t step) { // step = 4*4 = 16 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1)); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_four_times(vector_path_4); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - }); - } - - private: - Rows src_rows_; - uint32x2_t v_src_stride_; // load_large - uint32x4_t vq_src_stride_; // load_small - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32Replicate - -template -class RemapF32Replicate { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32Replicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_element_stride_{vdup_n_u32( - 
static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - vq_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_element_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows_[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = vmlal_u32(vmovl_u32(vget_low_u32(x)), - vget_low_u32(y), v_src_element_stride_); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_element_stride_); - uint64_t acc = - static_cast(src_rows_[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_low, 1)]) - << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows_[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows_[vgetq_lane_u64(offset_high, 1)]) - << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType x = vld1q_f32(ptr_mapx); - MapVectorType y = vld1q_f32(ptr_mapy); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(x), v_xmax_); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(y), v_ymax_); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(x, zero), vcltq_u32(x0, v_xmax_)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(y, zero), vcltq_u32(y0, v_ymax_)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(x, vrndmq_f32(x)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(y, vrndmq_f32(y)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load(x0, y0); - float32x4_t b = load(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load(x0, y1); - float32x4_t d = load(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFFFF), vcvtaq_u32_f32(result)); - }; - - auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16 = vuzp1q_u16(res0, res1); - - vst1q_u16(&dst[0], 
result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_twice(vector_path_2); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - }); - } - - private: - Rows src_rows_; - uint32x2_t v_src_element_stride_; // load_large - uint32x4_t vq_src_element_stride_; // load_small - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32Replicate +void remap_f32_nearest_replicate( + uint32x4_t v_xmax, uint32x4_t v_ymax, uint32x4_t v_src_stride, + Rows src_rows, Columns dst, size_t dst_width, + Columns mapx, Columns mapy, const size_t kStep) { + LoopUnroll2 loop{dst_width, kStep}; + loop.unroll_once([&](size_t x) { + transform_pixels_replicate( + vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, + src_rows, dst.at(x)); + }); +} template -class RemapF32ConstantBorder; - -// TODO: Need to refactor to reduce the complexity -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -class RemapF32ConstantBorder { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - src_width_{src_width}, - src_height_{src_height}, - border_value_{border_value} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto get_edge_pixels = - [&](unsigned &a_result, unsigned &b_result, unsigned &c_result, - unsigned &d_result, int x0, int y0, ptrdiff_t offset, - Rows src_rows, int src_width, int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[offset + 1]; - } - } - if (y0 + 1 < src_height) { - offset += static_cast(src_rows.stride()); - if (x0 >= 0) { - c_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[offset + 1]; - } - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType xf = vld1q_f32(ptr_mapx); - MapVectorType yf = vld1q_f32(ptr_mapy); - // Convert obviously out-of-range coordinates to values that are just - // beyond the largest permitted image width & height. This avoids the need - // for special case handling elsewhere. 
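// A note on the `1 << 24` sentinel used on the next line: 2^24 is the
// largest range in which float32 represents every integer exactly, and
// remap_f32 rejects source images with width or height >= 2^24, so a
// coordinate clamped to this sentinel is always classified as out of
// range. Because any comparison against NaN is false, the vcleq/vbslq
// pair below also maps NaN coordinates to the sentinel, i.e. to border.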
- float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t offset = x0i + y0i * src_rows_.stride(); - - // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed - if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || - y0i + 1 >= static_cast(src_height_)) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; - - if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || - y0i >= static_cast(src_height_)) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, - y0i, offset, src_rows_, src_width_, src_height_); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows_[offset]; - b_array[i] = src_rows_[offset + 1]; - offset += src_rows_.stride(); - c_array[i] = src_rows_[offset]; - d_array[i] = src_rows_[offset + 1]; - } +void remap_f32_nearest_constant(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, + Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, + Columns mapy, const size_t kStep, + ScalarType border_value) { + LoopUnroll2 loop{dst_width, kStep}; + loop.unroll_once([&](size_t x) { + transform_pixels_constant( + vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x]), v_xmax, v_ymax, v_src_stride, + src_rows, dst.at(x), border_value); + }); +} - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); - }; +template +void remap_f32_linear(uint32x4_t v_xmax, uint32x4_t v_ymax, + uint32x4_t v_src_stride, Rows src_rows, + Columns dst, size_t dst_width, + Columns mapx, Columns mapy, + const size_t kStep, ScalarType border_value) { + auto load_xy = [&](size_t x) { + return FloatVectorPair{vld1q_f32(&mapx[x]), vld1q_f32(&mapy[x])}; + }; + + auto vector_path = [&](size_t x) { + float32x4_t a, b, c, d, xfrac, yfrac; + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + load_quad_pixels_replicate( + load_xy(x), v_xmax, v_ymax, v_src_stride, src_rows, xfrac, yfrac, a, + b, c, d); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + load_quad_pixels_constant( + load_xy(x), v_xmax, v_ymax, v_src_stride, border_value, src_rows, + xfrac, yfrac, a, b, c, d); + } + return lerp_2d(xfrac, yfrac, a, b, c, d); + }; - auto vector_path_4 = [&](size_t step) { // step = 4*4 = 16 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); + LoopUnroll2 loop{dst_width, kStep}; - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); + if constexpr (std::is_same::value) { + 
loop.unroll_four_times([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path(x); + x += kStep; + uint32x4_t res1 = vector_path(x); uint16x8_t result16_0 = vuzp1q_u16(res0, res1); - ptr_mapx += kStep; - ptr_mapy += kStep; - res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - res1 = vector_path_1(ptr_mapx, ptr_mapy); + x += kStep; + res0 = vector_path(x); + x += kStep; + res1 = vector_path(x); uint16x8_t result16_1 = vuzp1q_u16(res0, res1); - vst1q_u8(&dst[0], vuzp1q_u8(result16_0, result16_1)); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - LoopUnroll loop{width, kStep}; - loop.unroll_four_times(vector_path_4); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); + vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1)); }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - dst[0] = vgetq_lane_u32(result, 0); - dst[1] = vgetq_lane_u32(result, 1); - dst[2] = vgetq_lane_u32(result, 2); - dst[3] = vgetq_lane_u32(result, 3); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path(x); + dst[x] = vgetq_lane_u32(result, 0); + dst[x + 1] = vgetq_lane_u32(result, 1); + dst[x + 2] = vgetq_lane_u32(result, 2); + dst[x + 3] = vgetq_lane_u32(result, 3); }); - } - - private: - Rows src_rows_; - size_t src_width_; - size_t src_height_; - const ScalarType *border_value_; -}; // end of class RemapF32ConstantBorder -// NOLINTEND(readability-function-cognitive-complexity) - -// TODO: Need to refactor to reduce the complexity -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -class RemapF32ConstantBorder { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - src_width_{src_width}, - src_height_{src_height}, - border_value_{border_value} {} - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto get_edge_pixels = [&](unsigned &a_result, unsigned &b_result, - unsigned &c_result, unsigned &d_result, int x0, - int y0, ptrdiff_t index, - Rows src_rows, int src_width, - int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[index]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[index + 1]; - } - } - if (y0 + 1 < src_height) { - index += static_cast(src_rows.stride() / sizeof(ScalarType)); - if (x0 >= 0) { - c_result = src_rows[index]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[index + 1]; - } - } - }; - - auto vector_path_1 = [&](const float *ptr_mapx, const float *ptr_mapy) { - MapVectorType xf = vld1q_f32(ptr_mapx); - MapVectorType yf = vld1q_f32(ptr_mapy); - // Convert obviously out-of-range coordinates to values that are just - 
// beyond the largest permitted image width & height. This avoids the need - // for special case handling elsewhere. - float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t index = - x0i + y0i * static_cast(src_rows_.stride() / - sizeof(ScalarType)); - // std::cout << "x0i " << x0i << " y0i " << y0i << " index: " - // << index - // xw << "\n"; - // src_width < (1ULL << 24) && src_height_ < (1ULL << 24) is guaranteed - if (x0i < 0 || x0i + 1 >= static_cast(src_width_) || y0i < 0 || - y0i + 1 >= static_cast(src_height_)) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value_[0]; - - if (x0i < -1 || x0i >= static_cast(src_width_) || y0i < -1 || - y0i >= static_cast(src_height_)) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, - y0i, index, src_rows_, src_width_, src_height_); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows_[index]; - b_array[i] = src_rows_[index + 1]; - index += src_rows_.stride() / sizeof(ScalarType); - c_array[i] = src_rows_[index]; - d_array[i] = src_rows_[index + 1]; - } - - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); - }; - - auto vector_path_2 = [&](size_t step) { // step = 2*4 = 8 - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t res0 = vector_path_1(ptr_mapx, ptr_mapy); - - ptr_mapx += kStep; - ptr_mapy += kStep; - uint32x4_t res1 = vector_path_1(ptr_mapx, ptr_mapy); - uint16x8_t result16 = vuzp1q_u16(res0, res1); - - vst1q_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_twice(vector_path_2); - loop.unroll_once([&](size_t step) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = vector_path_1(ptr_mapx, ptr_mapy); - uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType *p_dst = &dst[x]; + uint32x4_t res0 = vector_path(x); + x += kStep; + uint32x4_t res1 = vector_path(x); + vst1q_u16(p_dst, vuzp1q_u16(res0, res1)); }); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t) { - const float *ptr_mapx = &mapx[0]; - const float *ptr_mapy = &mapy[0]; - uint32x4_t result = 
vector_path_1(ptr_mapx, ptr_mapy); + loop.unroll_once([&](size_t x) { + uint32x4_t result = vector_path(x); uint16x4_t result16 = vget_low_u16(vuzp1q_u16(result, result)); - vst1_u16(&dst[0], result16); + vst1_u16(&dst[x], result16); }); } +} - private: - Rows src_rows_; - size_t src_width_; - size_t src_height_; - const ScalarType *border_value_; -}; // end of class RemapF32ConstantBorder -// NOLINTEND(readability-function-cognitive-complexity) - -template -class RemapF32NearestReplicate; - -template -class RemapF32NearestReplicate { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestReplicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias)); - uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias)); - - // Clamp coordinates to within the dimensions of the source image - x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_)); - y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_)); - } - - uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)], - 0, - 0, - 0, - 0}; - return pixels; - } - - uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)], - 0, - 0, - 0, - 0}; - return pixels; - } - - void store_pixels(uint8x8_t pixels, Columns dst) { - dst[0] = vget_lane_u8(pixels, 0); - dst[1] = vget_lane_u8(pixels, 1); - dst[2] = vget_lane_u8(pixels, 2); - dst[3] = vget_lane_u8(pixels, 3); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - get_map_coordinates(mapx, mapy, x, y); - - uint8x8_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - store_pixels(pixels, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t 
v_src_stride_; - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32NearestReplicate - -template -class RemapF32NearestReplicate { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestReplicate(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - v_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_xmax_{vdupq_n_u32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_u32(static_cast(src_height - 1))} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(vaddq_f32(x_raw, bias)); - uint32x4_t y_nearest = vcvtmq_u32_f32(vaddq_f32(y_raw, bias)); - - // Clamp coordinates to within the dimensions of the source image - x = vmaxq_u32(vdupq_n_u32(0), vminq_u32(x_nearest, v_xmax_)); - y = vmaxq_u32(vdupq_n_u32(0), vminq_u32(y_nearest, v_ymax_)); - } - - uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * element_stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_element_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)]}; - return pixels; - } - - uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * element_stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)]}; - return pixels; - } - - void store_pixels(uint16x4_t pixels, Columns dst) { - vst1_u16(&dst[0], pixels); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - get_map_coordinates(mapx, mapy, x, y); - - uint16x4_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - store_pixels(pixels, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t v_src_element_stride_; - uint32x4_t v_xmax_; - uint32x4_t v_ymax_; -}; // end of class RemapF32NearestReplicate - -template -class RemapF32NearestConstant; - -template -class RemapF32NearestConstant { - public: - using ScalarType = uint8_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestConstant(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType 
*border_value) - : src_rows_{src_rows}, - v_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - v_width_{vdupq_n_u32(static_cast(src_width))}, - v_height_{vdupq_n_u32(static_cast(src_height))}, - v_border_{vdup_n_u8(*border_value)} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - float32x4_t x_biased = vaddq_f32(x_raw, bias); - float32x4_t y_biased = vaddq_f32(y_raw, bias); - - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased); - uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased); - - // Find whether coordinates are within the image dimensions. - uint32x4_t above_zero = - vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased)); - uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_), - vcltq_u32(y_nearest, v_height_)); - in_range = vandq_u32(above_zero, below_limits); - - // Zero out-of-range coordinates. - x = vandq_u32(in_range, x_nearest); - y = vandq_u32(in_range, y_nearest); - } - - uint8x8_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)], - 0, - 0, - 0, - 0}; - return pixels; - } - - uint8x8_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_stride_); - - uint8x8_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)], - 0, - 0, - 0, - 0}; - return pixels; - } - - void store_pixels(uint8x8_t pixels, Columns dst) { - dst[0] = vget_lane_u8(pixels, 0); - dst[1] = vget_lane_u8(pixels, 1); - dst[2] = vget_lane_u8(pixels, 2); - dst[3] = vget_lane_u8(pixels, 3); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - uint32x4_t in_range; - get_map_coordinates(mapx, mapy, x, y, in_range); - - uint8x8_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); - } else { - pixels = load_pixels_small(x, y); - } - - // Select between source pixels and border colour - uint8x8_t in_range_narrowed = - vmovn_u16(vcombine_u16(vmovn_u32(in_range), vdup_n_u16(0))); - uint8x8_t pixels_or_border = - vbsl_u8(in_range_narrowed, pixels, v_border_); - - store_pixels(pixels_or_border, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); - } - - private: - Rows src_rows_; - uint32x4_t v_src_stride_; - uint32x4_t v_width_; - uint32x4_t v_height_; - uint8x8_t v_border_; -}; // end of class RemapF32NearestConstant - -template 
-class RemapF32NearestConstant { - public: - using ScalarType = uint16_t; - using MapVecTraits = neon::VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; // float32x4_t - - RemapF32NearestConstant(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value) - : src_rows_{src_rows}, - v_src_element_stride_{vdupq_n_u32( - static_cast(src_rows_.stride() / sizeof(ScalarType)))}, - v_width_{vdupq_n_u32(static_cast(src_width))}, - v_height_{vdupq_n_u32(static_cast(src_height))}, - v_border_{vdup_n_u16(*border_value)} {} - - void get_map_coordinates(Columns mapx, Columns mapy, - uint32x4_t &x, uint32x4_t &y, uint32x4_t &in_range) { - MapVectorType x_raw = vld1q_f32(&mapx[0]); - MapVectorType y_raw = vld1q_f32(&mapy[0]); - - MapVectorType bias = vdupq_n_f32(0.5F); - float32x4_t x_biased = vaddq_f32(x_raw, bias); - float32x4_t y_biased = vaddq_f32(y_raw, bias); - - // Round to nearest positive value - uint32x4_t x_nearest = vcvtmq_u32_f32(x_biased); - uint32x4_t y_nearest = vcvtmq_u32_f32(y_biased); - - // Find whether coordinates are within the image dimensions. - uint32x4_t above_zero = - vandq_u32(vcgezq_f32(x_biased), vcgezq_f32(y_biased)); - uint32x4_t below_limits = vandq_u32(vcltq_u32(x_nearest, v_width_), - vcltq_u32(y_nearest, v_height_)); - in_range = vandq_u32(above_zero, below_limits); - - // Zero out-of-range coordinates. - x = vandq_u32(in_range, x_nearest); - y = vandq_u32(in_range, y_nearest); - } - - uint16x4_t load_pixels_large(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint64x2_t indices_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), - vget_low_u32(v_src_element_stride_)); - uint64x2_t indices_high = - vmlal_high_u32(vmovl_high_u32(x), y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u64(indices_low, 0)], - src_rows_[vgetq_lane_u64(indices_low, 1)], - src_rows_[vgetq_lane_u64(indices_high, 0)], - src_rows_[vgetq_lane_u64(indices_high, 1)]}; - return pixels; - } - - uint16x4_t load_pixels_small(uint32x4_t x, uint32x4_t y) { - // Calculate offsets from coordinates (y * stride + x) - uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride_); - - uint16x4_t pixels = {src_rows_[vgetq_lane_u32(indices, 0)], - src_rows_[vgetq_lane_u32(indices, 1)], - src_rows_[vgetq_lane_u32(indices, 2)], - src_rows_[vgetq_lane_u32(indices, 3)]}; - return pixels; - } - - void store_pixels(uint16x4_t pixels, Columns dst) { - vst1_u16(&dst[0], pixels); - } - - void process_row(size_t width, Columns mapx, - Columns mapy, Columns dst) { - const size_t kStep = VecTraits::num_lanes(); - - auto vector_path = [&](size_t step) { - uint32x4_t x, y; - uint32x4_t in_range; - get_map_coordinates(mapx, mapy, x, y, in_range); - - uint16x4_t pixels; - if constexpr (IsLarge) { - pixels = load_pixels_large(x, y); +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + Rows mapx_rows, + Rows mapy_rows) { + uint32x4_t v_src_stride = vdupq_n_u32( + static_cast(src_rows.stride() / sizeof(ScalarType))); + uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); + uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); + const size_t kStep = VecTraits::num_lanes(); + + for (size_t y = y_begin; y < y_end; ++y) { + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + 
remap_f32_nearest_replicate( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep); } else { - pixels = load_pixels_small(x, y); + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + remap_f32_nearest_constant( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, + border_value[0]); } - - // Select between source pixels and border colour - uint16x4_t in_range_narrowed = vmovn_u32(in_range); - uint16x4_t pixels_or_border = - vbsl_u16(in_range_narrowed, pixels, v_border_); - - store_pixels(pixels_or_border, dst); - - mapx += ptrdiff_t(step); - mapy += ptrdiff_t(step); - dst += ptrdiff_t(step); - }; - - LoopUnroll loop{width, kStep}; - loop.unroll_once(vector_path); - ptrdiff_t back_step = static_cast(loop.step()) - - static_cast(loop.remaining_length()); - mapx -= back_step; - mapy -= back_step; - dst -= back_step; - loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); + remap_f32_linear( + v_xmax, v_ymax, v_src_stride, src_rows, dst_rows.as_columns(), + dst_width, mapx_rows.as_columns(), mapy_rows.as_columns(), kStep, + Border == KLEIDICV_BORDER_TYPE_CONSTANT ? border_value[0] : 0); + } + ++mapx_rows; + ++mapy_rows; + ++dst_rows; } - - private: - Rows src_rows_; - uint32x4_t v_src_element_stride_; - uint32x4_t v_width_; - uint32x4_t v_height_; - uint16x4_t v_border_; -}; // end of class RemapF32NearestConstant +} // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) @@ -1101,8 +158,7 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, const float *mapy, size_t mapy_stride, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, - [[maybe_unused]] const T *border_value) { - // may need to remove the maybe_unused + const T *border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); @@ -1120,7 +176,9 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, // Calculating in float32_t will only be precise until 24 bits if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) { + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + // Empty source image is not supported + src_width == 0 || src_height == 0) { return KLEIDICV_ERROR_RANGE; } @@ -1130,52 +188,10 @@ kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, Rows dst_rows{dst, dst_stride, channels}; Rectangle rect{dst_width, dst_height}; - if (interpolation == KLEIDICV_INTERPOLATION_LINEAR) { - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32ConstantBorder operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32ConstantBorder operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, 
rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } - } else { - assert(interpolation == KLEIDICV_INTERPOLATION_NEAREST); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32NearestConstant operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32NearestConstant operation{src_rows, src_width, - src_height, border_value}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - RemapF32NearestReplicate operation{src_rows, src_width, - src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } else { - RemapF32NearestReplicate operation{src_rows, src_width, - src_height}; - zip_rows(operation, rect, mapx_rows, mapy_rows, dst_rows); - } - } - } + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + border_value, dst_rows, dst_width, 0, dst_height, + mapx_rows, mapy_rows); return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/remap_f32_sve2.cpp b/kleidicv/src/transform/remap_f32_sve2.cpp new file mode 100644 index 000000000..e4ad3e724 --- /dev/null +++ b/kleidicv/src/transform/remap_f32_sve2.cpp @@ -0,0 +1,321 @@ +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "kleidicv/sve2.h" +#include "kleidicv/transform/remap.h" +#include "transform_sve2.h" + +namespace kleidicv::sve2 { + +template +void remap_f32_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, + svuint32_t sv_src_stride, + Rows src_rows, svuint32_t sv_border, + Columns dst, size_t kStep, size_t dst_width, + Rows mapx_rows, + Rows mapy_rows) { + svbool_t pg_all32 = svptrue_b32(); + auto load_coords = [&](svbool_t pg, size_t xs) { + auto x = static_cast(xs); + return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), + svld1_f32(pg, &mapy_rows.as_columns()[x])); + }; + + auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { + svuint32_t x = svget2(coords, 0); + svuint32_t y = svget2(coords, 1); + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), + svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = + load_xy(in_range, x, y, sv_src_stride, src_rows); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); + return load_xy(pg, x, y, sv_src_stride, src_rows); + } + }; + + auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { + svfloat32x2_t coords = load_coords(pg32, x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + + svuint32_t xi, yi; + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + // Convert coordinates to integers. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
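+      // (Worked example, for illustration: xf = -1.0f rounds to s32 -1,
+      // which reinterprets to u32 0xFFFFFFFF; that always exceeds sv_xmax,
+      // which is at most 2^24 - 1, so the lane is treated as border.)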
+ xi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); + yi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); + } else { + // Round to the nearest integer, clamp it to within the dimensions of + // the source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + } + return svcreate2(xi, yi); + }; + + LoopUnroll2 loop{dst_width, kStep}; + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1b_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + x += kStep; + res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), + svreinterpret_u8_u16(result1)); + svst1(svptrue_b8(), p_dst, result); + }); + loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining( + [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + } + + if constexpr (std::is_same::value) { + auto vector_path_generic = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32_t result = + get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); + svst1h_u32(pg32, &dst[static_cast(x)], result); + }; + + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res32_0 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + x += kStep; + svuint32_t res32_1 = + get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); + svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svst1(svptrue_b16(), p_dst, result); + }); + loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); + loop.remaining( + [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); + } +} + +template +void remap_f32_linear(svuint32_t sv_xmax, svuint32_t sv_ymax, + svfloat32_t sv_xmaxf, svfloat32_t sv_ymaxf, + svuint32_t sv_src_stride, Rows src_rows, + svuint32_t sv_border, Columns dst, + size_t kStep, size_t dst_width, + Rows mapx_rows, + Rows mapy_rows) { + auto load_coords = [&](svbool_t pg, size_t xs) { + auto x = static_cast(xs); + return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), + svld1_f32(pg, &mapy_rows.as_columns()[x])); + }; + + auto calculate_linear = [&](svbool_t pg, uint32_t x) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + svfloat32x2_t coords = load_coords(pg, x); + return calculate_linear_replicated_border( + pg, coords, sv_xmaxf, 
sv_ymaxf, sv_src_stride, src_rows); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + svfloat32x2_t coords = load_coords(pg, x); + return calculate_linear_constant_border( + pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); + } + }; + + auto store_vector = [](svbool_t pg32, ScalarType* p_dst, svuint32_t result) { + if constexpr (std::is_same::value) { + svst1b_u32(pg32, p_dst, result); + } + if constexpr (std::is_same::value) { + svst1h_u32(pg32, p_dst, result); + } + }; + + svbool_t pg_all32 = svptrue_b32(); + LoopUnroll2 loop{dst_width, kStep}; + if constexpr (std::is_same::value) { + loop.unroll_four_times([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16_0 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(pg_all32, x); + x += kStep; + res1 = calculate_linear(pg_all32, x); + svuint16_t result16_1 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + } else if constexpr (std::is_same::value) { + loop.unroll_twice([&](size_t x) { + ScalarType* p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(pg_all32, x); + x += kStep; + svuint32_t res1 = calculate_linear(pg_all32, x); + svuint16_t result16 = + svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); + svst1_u16(svptrue_b16(), p_dst, result16); + }); + } + loop.unroll_once([&](size_t x) { + svuint32_t result = calculate_linear(pg_all32, x); + store_vector(pg_all32, &dst[static_cast(x)], result); + }); + loop.remaining([&](size_t x, size_t x_max) { + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint32_t result = calculate_linear(pg32, x); + store_vector(pg32, &dst[static_cast(x)], result); + }); +} + +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + Rows mapx_rows, + Rows mapy_rows) { + svuint32_t sv_xmax = svdup_n_u32(src_width - 1); + svuint32_t sv_ymax = svdup_n_u32(src_height - 1); + svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); + svuint32_t sv_border = svdup_n_u32(0); + + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + sv_border = svdup_n_u32(border_value[0]); + } + + svfloat32_t sv_xmaxf = svdup_n_f32(static_cast(src_width - 1)); + svfloat32_t sv_ymaxf = svdup_n_f32(static_cast(src_height - 1)); + + const size_t kStep = VecTraits::num_lanes(); + + for (size_t y = y_begin; y < y_end; ++y) { + Columns dst = dst_rows.as_columns(); + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + remap_f32_nearest( + sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, + dst_width, mapx_rows, mapy_rows); + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); + remap_f32_linear( + sv_xmax, sv_ymax, sv_xmaxf, sv_ymaxf, sv_src_stride, src_rows, + sv_border, dst, kStep, dst_width, mapx_rows, mapy_rows); + } + ++mapx_rows; + ++mapy_rows; + ++dst_rows; + } +} + +// Most of the complexity comes from parameter checking. 
+// NOLINTBEGIN(readability-function-cognitive-complexity) +template +kleidicv_error_t remap_f32(const T* src, size_t src_stride, size_t src_width, + size_t src_height, T* dst, size_t dst_stride, + size_t dst_width, size_t dst_height, size_t channels, + const float* mapx, size_t mapx_stride, + const float* mapy, size_t mapy_stride, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type, + const T* border_value) { + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height); + CHECK_IMAGE_SIZE(src_width, src_height); + CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } + + if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, + border_type, channels, interpolation)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + // Calculating in float32_t will only be precise until 24 bits + if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + // Empty source image is not supported + src_width == 0 || src_height == 0) { + return KLEIDICV_ERROR_RANGE; + } + + Rows src_rows{src, src_stride, channels}; + Rows mapx_rows{mapx, mapx_stride, 1}; + Rows mapy_rows{mapy, mapy_stride, 1}; + Rows dst_rows{dst, dst_stride, channels}; + Rectangle rect{dst_width, dst_height}; + + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + border_value, dst_rows, dst_width, 0, dst_height, + mapx_rows, mapy_rows); + + return KLEIDICV_OK; +} +// NOLINTEND(readability-function-cognitive-complexity) + +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const float* mapx, size_t mapx_stride, \ + const float* mapy, size_t mapy_stride, \ + kleidicv_interpolation_type_t interpolation, \ + kleidicv_border_type_t border_type, const type* border_value) + +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_s16_sve2.cpp b/kleidicv/src/transform/remap_s16_sve2.cpp new file mode 100644 index 000000000..4d5442c91 --- /dev/null +++ b/kleidicv/src/transform/remap_s16_sve2.cpp @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "kleidicv/sve2.h" +#include "kleidicv/transform/remap.h" +#include "transform_sve2.h" + +namespace kleidicv::sve2 { + +template +class RemapS16Replicate { + public: + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + + RemapS16Replicate(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_element_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_element_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / 
sizeof(ScalarType)); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, + svuint32_t offsets_t, svbool_t pg_t, + Columns dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + svuint32_t offsets_b, offsets_t; + svint16_t svzero = svdup_n_s16(0); + auto load_offsets = [&](svbool_t pg) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + // Clamp coordinates to within the dimensions of the source image + svuint16_t x = svreinterpret_u16_s16( + svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y = svreinterpret_u16_s16( + svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_))); + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + }; + + svbool_t pg_all16 = MapVecTraits::svptrue(); + svbool_t pg_all32 = svptrue_b32(); + + auto gather_load_generic_vector_path = [&](svbool_t pg, ptrdiff_t step) { + load_offsets(pg); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst); + mapxy += step; + dst += step; + }; + + // NOTE: gather load is not available in streaming mode + auto gather_load_full_vector_path = [&](ptrdiff_t step) { + load_offsets(pg_all16); + transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst); + mapxy += step; + dst += step; + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + gather_load_full_vector_path(static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + gather_load_generic_vector_path(pg, static_cast(length)); + }); + } + + private: + Rows src_rows_; + svuint16_t& v_src_element_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Replicate + +template <> +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svst1b_u16(pg, &dst[0], result); +} + +template <> +void RemapS16Replicate::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, Columns dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svst1_u16(pg, &dst[0], result); +} + +template +class RemapS16ConstantBorder { + public: + RemapS16ConstantBorder(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType* border_value, + svuint16_t& v_src_element_stride, svuint16_t& v_width, + svuint16_t& 
v_height, svuint16_t& v_border) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_element_stride}, + v_width_{v_width}, + v_height_{v_height}, + v_border_{v_border} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_width_ = svdup_u16(static_cast(src_width)); + v_height_ = svdup_u16(static_cast(src_height)); + v_border_ = svdup_u16(*border_value); + } + + void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, + svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst); + + void process_row(size_t width, Columns mapxy, + Columns dst) { + for (size_t i = 0; i < width; i += svcnth()) { + svbool_t pg = svwhilelt_b16(i, width); + + svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast(i * 2)]); + svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0)); + svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1)); + + // Find whether coordinates are within the image dimensions. + svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), + svcmplt_u16(pg, y, v_height_)); + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); + + // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) + svuint32_t offsets_b = + svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); + svuint32_t offsets_t = + svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); + + transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, + &dst[static_cast(i)]); + } + } + + private: + Rows src_rows_; + svuint16_t& v_src_element_stride_; + svuint16_t& v_width_; + svuint16_t& v_height_; + svuint16_t& v_border_; +}; // end of class RemapS16ConstantBorder + +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint8_t* dst) { + // Copy pixels from source + svuint32_t result_b = + svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1b_u16(pg, dst, result_selected); +} + +template <> +void RemapS16ConstantBorder::transform_pixels( + svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, + svbool_t pg_t, uint16_t* dst) { + // Account for the size of the source type when calculating offset + offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); + offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); + + // Copy pixels from source + svuint32_t result_b = + svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); + svuint32_t result_t = + svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); + + svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), + svreinterpret_u16_u32(result_t)); + + svuint16_t result_selected = svsel(pg_b, result, v_border_); + svst1_u16(pg, dst, result_selected); +} + +// Most of the complexity comes from parameter checking. 
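+// remap_s16 validates its arguments and then dispatches on border_type:
+// RemapS16ConstantBorder predicates the gather loads on an in-range mask and
+// selects the border colour for the masked-off lanes, while
+// RemapS16Replicate first clamps each coordinate into [0, max].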
+// NOLINTBEGIN(readability-function-cognitive-complexity) +template +kleidicv_error_t remap_s16(const T* src, size_t src_stride, size_t src_width, + size_t src_height, T* dst, size_t dst_stride, + size_t dst_width, size_t dst_height, size_t channels, + const int16_t* mapxy, size_t mapxy_stride, + kleidicv_border_type_t border_type, + const T* border_value) { + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); + CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); + CHECK_IMAGE_SIZE(src_width, src_height); + CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } + + if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, + border_type, channels)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + Rows src_rows{src, src_stride, channels}; + Rows mapxy_rows{mapxy, mapxy_stride, 2}; + Rows dst_rows{dst, dst_stride, channels}; + svuint16_t sv_src_element_stride; + Rectangle rect{dst_width, dst_height}; + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { + svuint16_t sv_width, sv_height, sv_border; + RemapS16ConstantBorder operation{ + src_rows, src_width, src_height, border_value, sv_src_element_stride, + sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); + svint16_t sv_xmax, sv_ymax; + RemapS16Replicate operation{src_rows, src_width, + src_height, sv_src_element_stride, + sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, dst_rows); + } + return KLEIDICV_OK; +} +// NOLINTEND(readability-function-cognitive-complexity) + +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ + kleidicv_border_type_t border_type, const type* border_value) + +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_s16point5_sve2.cpp similarity index 50% rename from kleidicv/src/transform/remap_sc.h rename to kleidicv/src/transform/remap_s16point5_sve2.cpp index a0e27f169..489d37386 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_s16point5_sve2.cpp @@ -2,274 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_REMAP_SC_H -#define KLEIDICV_REMAP_SC_H - #include -#include #include #include #include #include -#include -#include -#include "common_sc.h" #include "kleidicv/sve2.h" #include "kleidicv/transform/remap.h" +#include "transform_sve2.h" -namespace KLEIDICV_TARGET_NAMESPACE { - -template -class RemapS16Replicate { - public: - using MapVecTraits = VecTraits; - using MapVectorType = typename MapVecTraits::VectorType; - using MapVector2Type = typename MapVecTraits::Vector2Type; - - RemapS16Replicate(Rows src_rows, size_t src_width, - size_t src_height, svuint16_t& v_src_element_stride, - MapVectorType& v_x_max, MapVectorType& v_y_max) - : src_rows_{src_rows}, - v_src_element_stride_{v_src_element_stride}, - v_xmax_{v_x_max}, - v_ymax_{v_y_max} { - v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); - v_xmax_ = 
svdup_s16(static_cast(src_width - 1)); - v_ymax_ = svdup_s16(static_cast(src_height - 1)); - } - - void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, - Columns dst); - - void process_row(size_t width, Columns mapxy, - Columns dst) { - svuint32_t offsets_b, offsets_t; - svint16_t svzero = svdup_n_s16(0); - auto load_offsets = [&](svbool_t pg) { - MapVector2Type xy = svld2_s16(pg, &mapxy[0]); - // Clamp coordinates to within the dimensions of the source image - svuint16_t x = svreinterpret_u16_s16( - svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_))); - svuint16_t y = svreinterpret_u16_s16( - svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_))); - // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) - offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); - offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); - }; - - svbool_t pg_all16 = MapVecTraits::svptrue(); - svbool_t pg_all32 = svptrue_b32(); - - auto gather_load_generic_vector_path = [&](svbool_t pg, ptrdiff_t step) { - load_offsets(pg); - svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); - svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); - transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst); - mapxy += step; - dst += step; - }; - - // NOTE: gather load is not available in streaming mode - auto gather_load_full_vector_path = [&](ptrdiff_t step) { - load_offsets(pg_all16); - transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst); - mapxy += step; - dst += step; - }; - - LoopUnroll loop{width, MapVecTraits::num_lanes()}; - loop.unroll_once([&](size_t step) { - gather_load_full_vector_path(static_cast(step)); - }); - loop.remaining([&](size_t length, size_t step) { - svbool_t pg = MapVecTraits::svwhilelt(step - length, step); - gather_load_generic_vector_path(pg, static_cast(length)); - }); - } - - private: - Rows src_rows_; - svuint16_t& v_src_element_stride_; - MapVectorType& v_xmax_; - MapVectorType& v_ymax_; -}; // end of class RemapS16Replicate - -template <> -void RemapS16Replicate::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, Columns dst) { - // Copy pixels from source - svuint32_t result_b = - svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svst1b_u16(pg, &dst[0], result); -} - -template <> -void RemapS16Replicate::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, Columns dst) { - // Account for the size of the source type when calculating offset - offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); - offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); - - // Copy pixels from source - svuint32_t result_b = - svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svst1_u16(pg, &dst[0], result); -} - -template -class RemapS16ConstantBorder { - public: - RemapS16ConstantBorder(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType* border_value, - svuint16_t& v_src_element_stride, svuint16_t& v_width, - svuint16_t& v_height, svuint16_t& v_border) - : 
src_rows_{src_rows}, - v_src_element_stride_{v_src_element_stride}, - v_width_{v_width}, - v_height_{v_height}, - v_border_{v_border} { - v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); - v_width_ = svdup_u16(static_cast(src_width)); - v_height_ = svdup_u16(static_cast(src_height)); - v_border_ = svdup_u16(*border_value); - } - - void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, - svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst); - - void process_row(size_t width, Columns mapxy, - Columns dst) { - for (size_t i = 0; i < width; i += svcnth()) { - svbool_t pg = svwhilelt_b16(i, width); - - svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast(i * 2)]); - svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0)); - svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1)); - - // Find whether coordinates are within the image dimensions. - svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), - svcmplt_u16(pg, y, v_height_)); - svbool_t pg_b = in_range; - svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); - - // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) - svuint32_t offsets_b = - svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); - svuint32_t offsets_t = - svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); - - transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, - &dst[static_cast(i)]); - } - } - - private: - Rows src_rows_; - svuint16_t& v_src_element_stride_; - svuint16_t& v_width_; - svuint16_t& v_height_; - svuint16_t& v_border_; -}; // end of class RemapS16ConstantBorder - -template <> -void RemapS16ConstantBorder::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, uint8_t* dst) { - // Copy pixels from source - svuint32_t result_b = - svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svuint16_t result_selected = svsel(pg_b, result, v_border_); - svst1b_u16(pg, dst, result_selected); -} - -template <> -void RemapS16ConstantBorder::transform_pixels( - svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t, - svbool_t pg_t, uint16_t* dst) { - // Account for the size of the source type when calculating offset - offsets_b = svlsl_n_u32_x(pg, offsets_b, 1); - offsets_t = svlsl_n_u32_x(pg, offsets_t, 1); - - // Copy pixels from source - svuint32_t result_b = - svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); - svuint32_t result_t = - svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); - - svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), - svreinterpret_u16_u32(result_t)); - - svuint16_t result_selected = svsel(pg_b, result, v_border_); - svst1_u16(pg, dst, result_selected); -} - -// Most of the complexity comes from parameter checking. 
-// NOLINTBEGIN(readability-function-cognitive-complexity) -template -kleidicv_error_t remap_s16_sc(const T* src, size_t src_stride, size_t src_width, - size_t src_height, T* dst, size_t dst_stride, - size_t dst_width, size_t dst_height, - size_t channels, const int16_t* mapxy, - size_t mapxy_stride, - kleidicv_border_type_t border_type, - const T* border_value) { - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - if (!remap_s16_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels)) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - Rows src_rows{src, src_stride, channels}; - Rows mapxy_rows{mapxy, mapxy_stride, 2}; - Rows dst_rows{dst, dst_stride, channels}; - svuint16_t sv_src_element_stride; - Rectangle rect{dst_width, dst_height}; - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - svuint16_t sv_width, sv_height, sv_border; - RemapS16ConstantBorder operation{ - src_rows, src_width, src_height, border_value, sv_src_element_stride, - sv_width, sv_height, sv_border}; - zip_rows(operation, rect, mapxy_rows, dst_rows); - } else { - assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - svint16_t sv_xmax, sv_ymax; - RemapS16Replicate operation{src_rows, src_width, - src_height, sv_src_element_stride, - sv_xmax, sv_ymax}; - zip_rows(operation, rect, mapxy_rows, dst_rows); - } - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) +namespace kleidicv::sve2 { template inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac, @@ -778,12 +522,14 @@ class RemapS16Point5ConstantBorder { // Most of the complexity comes from parameter checking. 
// NOLINTBEGIN(readability-function-cognitive-complexity) template -kleidicv_error_t remap_s16point5_sc( - const T* src, size_t src_stride, size_t src_width, size_t src_height, - T* dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t channels, const int16_t* mapxy, size_t mapxy_stride, - const uint16_t* mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, const T* border_value) { +kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, + size_t src_width, size_t src_height, T* dst, + size_t dst_stride, size_t dst_width, + size_t dst_height, size_t channels, + const int16_t* mapxy, size_t mapxy_stride, + const uint16_t* mapfrac, size_t mapfrac_stride, + kleidicv_border_type_t border_type, + const T* border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); @@ -821,291 +567,17 @@ kleidicv_error_t remap_s16point5_sc( } return KLEIDICV_OK; } - -template -void remap32f_nearest(svuint32_t sv_xmax, svuint32_t sv_ymax, - svuint32_t sv_src_stride, Rows src_rows, - svuint32_t sv_border, Columns dst, - size_t kStep, size_t dst_width, - Rows mapx_rows, - Rows mapy_rows) { - svbool_t pg_all32 = svptrue_b32(); - auto load_coords = [&](svbool_t pg, size_t xs) { - auto x = static_cast(xs); - return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), - svld1_f32(pg, &mapy_rows.as_columns()[x])); - }; - - auto get_pixels = [&](svbool_t pg, svuint32x2_t coords) { - svuint32_t x = svget2(coords, 0); - svuint32_t y = svget2(coords, 1); - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), - svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = load_common( - in_range, x, y, sv_src_stride, src_rows); - // Select between source pixels and border colour - return svsel_u32(in_range, result, sv_border); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_REPLICATE); - return load_common(pg, x, y, sv_src_stride, - src_rows); - } - }; - - auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { - svfloat32x2_t coords = load_coords(pg32, x); - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - svuint32_t xi, yi; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - // Round to the nearest integer - xi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); - yi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); - } else { - // Round to the nearest integer, clamp it to within the dimensions of - // the source image (negative values are already saturated to 0) - xi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), - sv_xmax); - yi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), - sv_ymax); - } - return svcreate2(xi, yi); - }; - - LoopUnroll2 loop{dst_width, kStep}; - - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - 
get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); - } - - if constexpr (std::is_same::value) { - auto vector_path_generic = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - svuint32_t result = - get_pixels(pg32, calculate_nearest_coordinates(pg32, x)); - svst1h_u32(pg32, &dst[static_cast(x)], result); - }; - - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - get_pixels(pg_all32, calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svst1(svptrue_b16(), p_dst, result); - }); - loop.unroll_once([&](size_t x) { vector_path_generic(x, x + kStep, dst); }); - loop.remaining( - [&](size_t x, size_t length) { vector_path_generic(x, length, dst); }); - } -} - -// TODO reduce functional complexity -template -void remap32f_process_rows(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType* border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end, - Rows mapx_rows, - Rows mapy_rows) { - svbool_t pg_all32 = svptrue_b32(); - svuint32_t sv_xmax = svdup_n_u32(src_width - 1); - svuint32_t sv_ymax = svdup_n_u32(src_height - 1); - svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); - svuint32_t sv_border = svdup_n_u32(0); - - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); - } - - svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); - svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); - - const size_t kStep = VecTraits::num_lanes(); - - // auto get_coordinates = [&](svbool_t pg, size_t xs) { - auto coordinate_getter = [&](svbool_t pg, size_t xs) { - auto x = static_cast(xs); - return svcreate2(svld1_f32(pg, &mapx_rows.as_columns()[x]), - svld1_f32(pg, &mapy_rows.as_columns()[x])); - }; - - auto calculate_linear = [&](svbool_t pg, uint32_t x) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); - } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - Columns dst = dst_rows.as_columns(); - LoopUnroll2 loop{dst_width, kStep}; - if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - remap32f_nearest( - sv_xmax, sv_ymax, sv_src_stride, src_rows, sv_border, dst, kStep, - dst_width, 
mapx_rows, mapy_rows); - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - if constexpr (std::is_same::value) { - loop.unroll_four_times([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); - } else if constexpr (std::is_same::value) { - loop.unroll_twice([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u16(svptrue_b16(), p_dst, result16); - }); - } - loop.unroll_once([&](size_t x) { - ScalarType* p_dst = &dst[static_cast(x)]; - svuint32_t result = calculate_linear(pg_all32, x); - if constexpr (std::is_same::value) { - svst1b_u32(pg_all32, p_dst, result); - } - if constexpr (std::is_same::value) { - svst1h_u32(pg_all32, p_dst, result); - } - }); - loop.remaining([&](size_t x, size_t x_max) { - ScalarType* p_dst = &dst[static_cast(x)]; - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t result = calculate_linear(pg32, x); - if constexpr (std::is_same::value) { - svst1b_u32(pg32, p_dst, result); - } - if constexpr (std::is_same::value) { - svst1h_u32(pg32, p_dst, result); - } - }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); - } - ++mapx_rows; - ++mapy_rows; - ++dst_rows; - } -} // NOLINTEND(readability-function-cognitive-complexity) -// Most of the complexity comes from parameter checking. 
-// NOLINTBEGIN(readability-function-cognitive-complexity) -template -kleidicv_error_t remap_f32_sc(const T* src, size_t src_stride, size_t src_width, - size_t src_height, T* dst, size_t dst_stride, - size_t dst_width, size_t dst_height, - size_t channels, const float* mapx, - size_t mapx_stride, const float* mapy, - size_t mapy_stride, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, - [[maybe_unused]] const T* border_value) { - // may need to remove the maybe_unused - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapx, mapx_stride, dst_height); - CHECK_POINTER_AND_STRIDE(mapy, mapy_stride, dst_height); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - if (!remap_f32_is_implemented(src_stride, src_width, src_height, dst_width, - border_type, channels, interpolation)) { - return KLEIDICV_ERROR_NOT_IMPLEMENTED; - } - - // Calculating in float32_t will only be precise until 24 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24)) { - return KLEIDICV_ERROR_RANGE; - } - - Rows src_rows{src, src_stride, channels}; - Rows mapx_rows{mapx, mapx_stride, 1}; - Rows mapy_rows{mapy, mapy_stride, 1}; - Rows dst_rows{dst, dst_stride, channels}; - Rectangle rect{dst_width, dst_height}; - - remap32f_process_rows(remap_image_is_large(src_rows, src_height), - interpolation, border_type, src_rows, src_width, - src_height, border_value, dst_rows, dst_width, 0, - dst_height, mapx_rows, mapy_rows); - - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) +#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5( \ + const type* src, size_t src_stride, size_t src_width, size_t src_height, \ + type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ + size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ + const uint16_t* mapfrac, size_t mapfrac_stride, \ + kleidicv_border_type_t border_type, const type* border_value) -} // namespace KLEIDICV_TARGET_NAMESPACE +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); +KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); -#endif // KLEIDICV_REMAP_SC_H +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/remap_sve2.cpp b/kleidicv/src/transform/remap_sve2.cpp deleted file mode 100644 index 155b8d88c..000000000 --- a/kleidicv/src/transform/remap_sve2.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include "remap_sc.h" - -namespace kleidicv::sve2 { - -template -kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, - size_t src_height, T *dst, size_t dst_stride, - size_t dst_width, size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_s16_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, mapxy, - mapxy_stride, border_type, border_value); -} - -template -kleidicv_error_t remap_s16point5(const T *src, size_t src_stride, - size_t src_width, size_t src_height, T *dst, - size_t dst_stride, size_t 
dst_width, - size_t dst_height, size_t channels, - const int16_t *mapxy, size_t mapxy_stride, - const uint16_t *mapfrac, size_t mapfrac_stride, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_s16point5_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, - mapxy, mapxy_stride, mapfrac, mapfrac_stride, - border_type, border_value); -} - -template -kleidicv_error_t remap_f32(const T *src, size_t src_stride, size_t src_width, - size_t src_height, T *dst, size_t dst_stride, - size_t dst_width, size_t dst_height, size_t channels, - const float *mapx, size_t mapx_stride, - const float *mapy, size_t mapy_stride, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, - const T *border_value) { - return remap_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height, channels, mapx, - mapx_stride, mapy, mapy_stride, interpolation, - border_type, border_value); -} - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ - const uint16_t *mapfrac, size_t mapfrac_stride, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); - -#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_f32( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t channels, const float *mapx, size_t mapx_stride, \ - const float *mapy, size_t mapy_stride, \ - kleidicv_interpolation_type_t interpolation, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint8_t); -KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_F32(uint16_t); - -} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/transform_common.h b/kleidicv/src/transform/transform_common.h new file mode 100644 index 000000000..eaf005f66 --- /dev/null +++ b/kleidicv/src/transform/transform_common.h @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/types.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +bool is_image_large(const Rows &rows, size_t height) { + return rows.stride() * height >= 1ULL << 32; +} + +// Convert border_type to a template argument. 
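+// Each overload below peels one runtime flag off the argument list and
+// re-invokes transform_operation with that flag baked in as a template
+// parameter, so the innermost instantiation is specialised for the full
+// (is_large, interpolation, border) combination and the per-pixel loops
+// contain no runtime dispatch.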
+template +void transform_operation(kleidicv_border_type_t border_type, Args &&...args) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + transform_operation( + std::forward(args)...); + } else { + transform_operation( + std::forward(args)...); + } +} + +// Convert interpolation_type to a template argument. +template +void transform_operation(kleidicv_interpolation_type_t interpolation_type, + Args &&...args) { + if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { + transform_operation( + std::forward(args)...); + } else { + transform_operation( + std::forward(args)...); + } +} + +// Convert is_large to a template argument. +template +void transform_operation(bool is_large, Args &&...args) { + if (KLEIDICV_UNLIKELY(is_large)) { + transform_operation(std::forward(args)...); + } else { + transform_operation(std::forward(args)...); + } +} + +} // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/src/transform/transform_neon.h b/kleidicv/src/transform/transform_neon.h new file mode 100644 index 000000000..afcc72da8 --- /dev/null +++ b/kleidicv/src/transform/transform_neon.h @@ -0,0 +1,256 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/neon.h" +#include "kleidicv/types.h" +#include "transform_common.h" + +namespace kleidicv::neon { + +typedef struct { + float32x4_t x, y; +} FloatVectorPair; + +template +float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, + Rows& src_rows) { + if constexpr (IsLarge) { + uint64x2_t offset_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = + static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + } else { + uint32x4_t offset = vmlaq_u32(x, y, v_src_stride); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + } +} + +template +float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, + uint32x4_t in_range, + ScalarType border_value, + uint32x4_t v_src_stride, + Rows src_rows) { + if constexpr (IsLarge) { + uint64x2_t offset_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t offset_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + uint64_t pixel0 = vgetq_lane_u32(in_range, 0) + ? src_rows[vgetq_lane_u64(offset_low, 0)] + : border_value; + uint64_t pixel1 = vgetq_lane_u32(in_range, 1) + ? src_rows[vgetq_lane_u64(offset_low, 1)] + : border_value; + uint64_t pixel2 = vgetq_lane_u32(in_range, 2) + ? src_rows[vgetq_lane_u64(offset_high, 0)] + : border_value; + uint64_t pixel3 = vgetq_lane_u32(in_range, 3) + ? 
src_rows[vgetq_lane_u64(offset_high, 1)]
+                          : border_value;
+    uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
+    rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  } else {
+    uint32x4_t offset = vmlaq_u32(x, y, v_src_stride);
+    uint64_t pixel0 = vgetq_lane_u32(in_range, 0)
+                          ? src_rows[vgetq_lane_u32(offset, 0)]
+                          : border_value;
+    uint64_t pixel1 = vgetq_lane_u32(in_range, 1)
+                          ? src_rows[vgetq_lane_u32(offset, 1)]
+                          : border_value;
+    uint64_t pixel2 = vgetq_lane_u32(in_range, 2)
+                          ? src_rows[vgetq_lane_u32(offset, 2)]
+                          : border_value;
+    uint64_t pixel3 = vgetq_lane_u32(in_range, 3)
+                          ? src_rows[vgetq_lane_u32(offset, 3)]
+                          : border_value;
+    uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32));
+    rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1);
+    return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc));
+  }
+}
+
+template
+void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax,
+                                uint32x4_t v_ymax, uint32x4_t v_src_stride,
+                                Rows src_rows,
+                                float32x4_t& xfrac, float32x4_t& yfrac,
+                                float32x4_t& a, float32x4_t& b, float32x4_t& c,
+                                float32x4_t& d) {
+  auto&& [xf, yf] = xy;
+  // Truncating convert to int
+  uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax);
+  uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax);
+
+  // Get fractional part, or 0 if out of range
+  float32x4_t zero = vdupq_n_f32(0.F);
+  uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax));
+  uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax));
+  xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+  yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+
+  // x1 = x0 + 1, except if it's already xmax or out of range
+  // (subtracting the all-ones in-range mask adds 1)
+  uint32x4_t x1 = vsubq_u32(x0, x_in_range);
+  uint32x4_t y1 = vsubq_u32(y0, y_in_range);
+
+  // a: top left, b: top right, c: bottom left, d: bottom right
+  a = load_xy(x0, y0, v_src_stride, src_rows);
+  b = load_xy(x1, y0, v_src_stride, src_rows);
+  c = load_xy(x0, y1, v_src_stride, src_rows);
+  d = load_xy(x1, y1, v_src_stride, src_rows);
+}
+
+template
+void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax,
+                               uint32x4_t v_ymax, uint32x4_t v_src_stride,
+                               ScalarType border_value,
+                               Rows src_rows,
+                               float32x4_t& xfrac, float32x4_t& yfrac,
+                               float32x4_t& a, float32x4_t& b, float32x4_t& c,
+                               float32x4_t& d) {
+  auto&& [xf, yf] = xy;
+  // Convert coordinates to integers, truncating towards minus infinity.
+  // Negative numbers will become large positive numbers.
+  // Since the source width and height are known to be <=2^24, these large
+  // positive numbers will always be treated as outside the source image
+  // bounds.
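+  // For example, xf = -0.25F floors to -1, which reinterpreted as uint32_t
+  // is 0xFFFFFFFF and compares above any permitted v_xmax, so that tap takes
+  // border_value; its neighbour x1 = 0 may still be in range, which gives
+  // the correct blend at the image edge.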
+  uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf));
+  uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf));
+  uint32x4_t x1 = vaddq_u32(x0, vdupq_n_u32(1));
+  uint32x4_t y1 = vaddq_u32(y0, vdupq_n_u32(1));
+  xfrac = vsubq_f32(xf, vrndmq_f32(xf));
+  yfrac = vsubq_f32(yf, vrndmq_f32(yf));
+  uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range;
+  {
+    uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax);
+    uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax);
+    uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax);
+    uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax);
+    a_in_range = vandq_u32(x0_in_range, y0_in_range);
+    b_in_range = vandq_u32(x1_in_range, y0_in_range);
+    c_in_range = vandq_u32(x0_in_range, y1_in_range);
+    d_in_range = vandq_u32(x1_in_range, y1_in_range);
+  }
+  a = load_xy_or_border(x0, y0, a_in_range, border_value,
+                        v_src_stride, src_rows);
+  b = load_xy_or_border(x1, y0, b_in_range, border_value,
+                        v_src_stride, src_rows);
+  c = load_xy_or_border(x0, y1, c_in_range, border_value,
+                        v_src_stride, src_rows);
+  d = load_xy_or_border(x1, y1, d_in_range, border_value,
+                        v_src_stride, src_rows);
+}
+
+// Bilinear blend: lerp along x on both rows, then lerp the rows along y.
+inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a,
+                          float32x4_t b, float32x4_t c, float32x4_t d) {
+  float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac);
+  float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac);
+  float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac);
+  return vcvtaq_u32_f32(result);
+}
+
+template
+void transform_pixels_replicate(float32x4_t xf, float32x4_t yf,
+                                uint32x4_t v_xmax, uint32x4_t v_ymax,
+                                uint32x4_t v_src_stride,
+                                Rows src_rows,
+                                Columns dst) {
+  // Round to nearest, with ties away from zero (i.e. round 0.5 up).
+  // Clamp coordinates to within the dimensions of the source image
+  // (vcvtaq already saturated negative values to 0)
+  uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax);
+  uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax);
+
+  // Copy pixels from source
+  if constexpr (IsLarge) {
+    uint64x2_t indices_low =
+        vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y),
+                  vget_low_u32(v_src_stride));
+    uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y),
+                                        vget_low_u32(v_src_stride));
+    dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)];
+    dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)];
+    dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)];
+    dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)];
+  } else {
+    uint32x4_t indices = vmlaq_u32(x, y, v_src_stride);
+    dst[0] = src_rows[vgetq_lane_u32(indices, 0)];
+    dst[1] = src_rows[vgetq_lane_u32(indices, 1)];
+    dst[2] = src_rows[vgetq_lane_u32(indices, 2)];
+    dst[3] = src_rows[vgetq_lane_u32(indices, 3)];
+  }
+}
+
+template
+void transform_pixels_constant(float32x4_t xf, float32x4_t yf,
+                               uint32x4_t v_xmax, uint32x4_t v_ymax,
+                               uint32x4_t v_src_stride,
+                               Rows src_rows,
+                               Columns dst,
+                               ScalarType border_value) {
+  // Convert coordinates to integers.
+  // Negative numbers will become large positive numbers.
+  // Since the source width and height are known to be <=2^24, these large
+  // positive numbers will always be treated as outside the source image
+  // bounds.
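+  // Unlike the linear path, vcvtaq (round to nearest, ties away from zero)
+  // is used here because nearest-neighbour needs the single closest pixel
+  // rather than the top-left corner of a 2x2 quad.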
+ uint32x4_t x = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); + uint32x4_t y = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); + uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); + + // Copy pixels from source + if constexpr (IsLarge) { + uint64x2_t indices_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), + vget_low_u32(v_src_stride)); + uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), + vget_low_u32(v_src_stride)); + dst[0] = vgetq_lane_u32(in_range, 0) + ? src_rows[vgetq_lane_u64(indices_low, 0)] + : border_value; + dst[1] = vgetq_lane_u32(in_range, 1) + ? src_rows[vgetq_lane_u64(indices_low, 1)] + : border_value; + dst[2] = vgetq_lane_u32(in_range, 2) + ? src_rows[vgetq_lane_u64(indices_high, 0)] + : border_value; + dst[3] = vgetq_lane_u32(in_range, 3) + ? src_rows[vgetq_lane_u64(indices_high, 1)] + : border_value; + } else { + uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); + dst[0] = vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(indices, 0)] + : border_value; + dst[1] = vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(indices, 1)] + : border_value; + dst[2] = vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(indices, 2)] + : border_value; + dst[3] = vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(indices, 3)] + : border_value; + } +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/transform/common_sc.h b/kleidicv/src/transform/transform_sve2.h similarity index 60% rename from kleidicv/src/transform/common_sc.h rename to kleidicv/src/transform/transform_sve2.h index 17ac56866..74b6d889a 100644 --- a/kleidicv/src/transform/common_sc.h +++ b/kleidicv/src/transform/transform_sve2.h @@ -18,56 +18,14 @@ #include "kleidicv/sve2.h" #include "kleidicv/traits.h" #include "kleidicv/types.h" +#include "transform_common.h" -namespace KLEIDICV_TARGET_NAMESPACE { - -// Convert border_type to a template argument. -template -void remap32f_process_rows(kleidicv_border_type_t border_type, Args &&...args) { - if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - remap32f_process_rows( - std::forward(args)...); - } else { - remap32f_process_rows( - std::forward(args)...); - } -} - -// Convert interpolation_type to a template argument. -template -void remap32f_process_rows(kleidicv_interpolation_type_t interpolation_type, - Args &&...args) { - if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { - remap32f_process_rows( - std::forward(args)...); - } else { - remap32f_process_rows( - std::forward(args)...); - } -} - -template -bool remap_image_is_large(const Rows &rows, size_t height) { - return rows.stride() * height >= 1ULL << 32; -} - -// Convert is_large to a template argument. 
-template -void remap32f_process_rows(bool is_large, Args &&...args) { - if (KLEIDICV_UNLIKELY(is_large)) { - remap32f_process_rows(std::forward(args)...); - } else { - remap32f_process_rows(std::forward(args)...); - } -} +namespace kleidicv::sve2 { template -svuint32_t inline load_common(svbool_t pg, svuint32_t x, svuint32_t y, - svuint32_t sv_src_stride, - Rows &src_rows) { +svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y, + svuint32_t sv_src_stride, + Rows &src_rows) { if constexpr (std::is_same::value) { if constexpr (IsLarge) { svbool_t pg_b = pg; @@ -115,7 +73,7 @@ svuint32_t inline calculate_linear_replicated_border( svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, svuint32_t sv_src_stride, Rows &src_rows) { auto load_source = [&](svuint32_t x, svuint32_t y) { - return load_common(pg, x, y, sv_src_stride, src_rows); + return load_xy(pg, x, y, sv_src_stride, src_rows); }; svbool_t pg_all32 = svptrue_b32(); svfloat32_t xf = svget2(coords, 0); @@ -164,7 +122,7 @@ svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y, svbool_t in_range = svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); svuint32_t result = - load_common(in_range, x, y, sv_src_stride, src_rows); + load_xy(in_range, x, y, sv_src_stride, src_rows); // Select between source pixels and border colour return svsel_u32(in_range, result, sv_border); } @@ -174,48 +132,44 @@ svuint32_t inline calculate_linear_constant_border( svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax, svuint32_t sv_ymax, svuint32_t sv_src_stride, Rows &src_rows) { - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - // Convert obviously out-of-range coordinates to values that are just beyond - // the largest permitted image width & height. This avoids the need for - // special case handling elsewhere. - svfloat32_t big = svdup_n_f32(1 << 24); - xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big); - yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big); - - svfloat32_t xf0 = svrintm_f32_x(pg, xf); - svfloat32_t yf0 = svrintm_f32_x(pg, yf); - - svint32_t x0 = svcvt_s32_x(pg, xf0); - svint32_t y0 = svcvt_s32_x(pg, yf0); - svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1)); - svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1)); - - svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0); - svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0); + // Convert coordinates to integers, truncating towards minus infinity. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. 
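+  // x0/y0 hold the floor of each map coordinate, x1/y1 the neighbouring
+  // texel, and xfrac/yfrac the bilinear weights used by the svmla blends
+  // below.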
+ svuint32_t x0, y0, x1, y1; + svfloat32_t xfrac, yfrac; + { + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + svfloat32_t xf0 = svrintm_f32_x(pg, xf); + svfloat32_t yf0 = svrintm_f32_x(pg, yf); + x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0)); + y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0)); + x1 = svadd_u32_x(pg, x0, svdup_n_u32(1)); + y1 = svadd_u32_x(pg, y0, svdup_n_u32(1)); + + xfrac = svsub_f32_x(pg, xf, xf0); + yfrac = svsub_f32_x(pg, yf, yf0); + } - svfloat32_t a = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y0), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); - svfloat32_t b = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y0), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); + svfloat32_t a = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x0, y0, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); + svfloat32_t b = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x1, y0, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x0), svreinterpret_u32_s32(y1), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); - svfloat32_t d = svcvt_f32_u32_x( - pg, get_pixels_or_border( - pg, svreinterpret_u32_s32(x1), svreinterpret_u32_s32(y1), - sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows)); + svfloat32_t c = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x0, y1, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); + svfloat32_t d = svcvt_f32_u32_x(pg, get_pixels_or_border( + pg, x1, y1, sv_border, sv_xmax, + sv_ymax, sv_src_stride, src_rows)); svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac); svfloat32_t result = svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac); return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result)); } -} // namespace KLEIDICV_TARGET_NAMESPACE +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index c42bc0ef2..dd7494523 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -7,11 +7,11 @@ #include #include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" #include "kleidicv/traits.h" #include "kleidicv/transform/warp_perspective.h" #include "kleidicv/utils.h" +#include "transform_neon.h" namespace kleidicv::neon { @@ -33,161 +33,17 @@ namespace kleidicv::neon { // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // -namespace { - -typedef struct { - float32x4_t x, y; -} CoordVectorPair; - -template -inline uint32x4_t get_pixels_or_border_large(uint32x4_t x, uint32x4_t y, - uint32x4_t v_xmax, - uint32x4_t v_ymax, - uint32x2_t v_src_stride, - Rows src_rows, - const ScalarType *border_value) { - uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - uint64x2_t offsets_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); - uint64x2_t offsets_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); - // Copy pixels from source - uint32_t pixel_data[4] = { - vgetq_lane_u32(in_range, 0) ? 
src_rows[vgetq_lane_u64(offsets_low, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u64(offsets_low, 1)] - : border_value[0], - vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u64(offsets_high, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u64(offsets_high, 1)] - : border_value[0], - }; - return vld1q_u32(pixel_data); -} - -template -inline uint32x4_t get_pixels_or_border_small(uint32x4_t x, uint32x4_t y, - uint32x4_t v_xmax, - uint32x4_t v_ymax, - uint32x4_t v_src_stride, - Rows src_rows, - const ScalarType *border_value) { - uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); - // Calculate offsets from coordinates (y * stride + x) - // Use this path only when the final offsets fit into 32 bits - uint32x4_t offsets = vmlaq_u32(x, y, v_src_stride); - // Copy pixels from source - uint32_t pixel_data[4] = { - vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(offsets, 0)] - : border_value[0], - vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(offsets, 1)] - : border_value[0], - vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(offsets, 2)] - : border_value[0], - vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(offsets, 3)] - : border_value[0], - }; - return vld1q_u32(pixel_data); -} - -template -inline void get_edge_pixels(unsigned &a_result, unsigned &b_result, - unsigned &c_result, unsigned &d_result, int x0, - int y0, ptrdiff_t offset, - Rows src_rows, int src_width, - int src_height) { - if (y0 >= 0) { - if (x0 >= 0) { - a_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - b_result = src_rows[offset + 1]; - } - } - if (y0 + 1 < src_height) { - offset += src_rows.stride(); - if (x0 >= 0) { - c_result = src_rows[offset]; - } - if (x0 + 1 < src_width) { - d_result = src_rows[offset + 1]; - } - } -} - -template -inline uint32x4_t calculate_linear_constant_border( - float32x4_t xf, float32x4_t yf, Rows src_rows, - int src_width, int src_height, const ScalarType *border_value) { - // Convert obviously out-of-range coordinates to values that are just beyond - // the largest permitted image width & height. This avoids the need for - // special case handling elsewhere. 
- float32x4_t big = vdupq_n_f32(1 << 24); - xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); - yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); - - int32x4_t x0 = vcvtmq_s32_f32(xf); - int32x4_t y0 = vcvtmq_s32_f32(yf); - int x0_array[4], y0_array[4]; - unsigned a_array[4], b_array[4], c_array[4], d_array[4]; - vst1q_s32(x0_array, x0); - vst1q_s32(y0_array, y0); - for (int i = 0; i < 4; ++i) { - int x0i = x0_array[i]; - int y0i = y0_array[i]; - ptrdiff_t offset = x0i + y0i * src_rows.stride(); - - if (x0i < 0 || x0i + 1 >= src_width || y0i < 0 || y0i + 1 >= src_height) { - // Not entirely within the source image - - a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value[0]; - - if (x0i < -1 || x0i >= src_width || y0i < -1 || y0i >= src_height) { - // Completely outside the source image - continue; - } - - get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, y0i, - offset, src_rows, src_width, src_height); - continue; - } - - // Completely inside the source image - a_array[i] = src_rows[offset]; - b_array[i] = src_rows[offset + 1]; - offset += src_rows.stride(); - c_array[i] = src_rows[offset]; - d_array[i] = src_rows[offset + 1]; - } - - float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); - float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); - float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); - float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); - float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vcvtaq_u32_f32(result); -} - template -void warp_perspective_operation(Rows src_rows, - size_t src_width, size_t src_height, - const float transform[9], - const ScalarType *border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end) { +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const float transform[9], + const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end) { static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; uint32x4_t x0123_ = vld1q_u32(first_few_x); - uint32x2_t v_src_stride = - vdup_n_u32(static_cast(src_rows.stride())); - uint32x4_t vq_src_stride = + uint32x4_t v_src_stride = vdupq_n_u32(static_cast(src_rows.stride())); uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); @@ -205,154 +61,27 @@ void warp_perspective_operation(Rows src_rows, // Calculate coordinates into the source image float32x4_t xf = vmulq_f32(tx, iw); float32x4_t yf = vmulq_f32(ty, iw); - return CoordVectorPair{xf, yf}; - }; - - auto vector_path_nearest_small = [&](uint32_t x, Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Take the integer part, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); - uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); - // Calculate offsets from coordinates (y * stride + x) - // Use this path only when the final offsets fit into 32 bits - uint32x4_t offsets = vmlaq_u32(xi, yi, vq_src_stride); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows[vgetq_lane_u32(offsets, 0)]; - dst[ix + 1] = src_rows[vgetq_lane_u32(offsets, 1)]; - dst[ix + 2] = 
src_rows[vgetq_lane_u32(offsets, 2)]; - dst[ix + 3] = src_rows[vgetq_lane_u32(offsets, 3)]; - }; - - auto vector_path_nearest_large = [&](uint32_t x, Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Take the integer part, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); - uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - uint64x2_t offsets_low = - vmlal_u32(vmovl_u32(vget_low_u32(xi)), vget_low_u32(yi), v_src_stride); - uint64x2_t offsets_high = - vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows[vgetq_lane_u64(offsets_low, 0)]; - dst[ix + 1] = src_rows[vgetq_lane_u64(offsets_low, 1)]; - dst[ix + 2] = src_rows[vgetq_lane_u64(offsets_high, 0)]; - dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)]; - }; - - auto vector_path_nearest_constant_border = [&](uint32_t x, - Columns dst) { - auto &&[xf, yf] = calculate_coordinates(x); - // Convert coordinates to integers. - // Negative numbers will become large positive numbers. - // Since the source width and height is known to be <=2^24 these large - // positive numbers will always be treated as outside the source image - // bounds. - uint32x4_t xi = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); - uint32x4_t yi = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); - uint32x4_t pixels; - if constexpr (IsLarge) { - pixels = get_pixels_or_border_large(xi, yi, v_xmax, v_ymax, v_src_stride, - src_rows, border_value); - } else { - pixels = get_pixels_or_border_small(xi, yi, v_xmax, v_ymax, vq_src_stride, - src_rows, border_value); - } - - ptrdiff_t ix = static_cast(x); - dst[ix] = static_cast(vgetq_lane_u32(pixels, 0)); - dst[ix + 1] = static_cast(vgetq_lane_u32(pixels, 1)); - dst[ix + 2] = static_cast(vgetq_lane_u32(pixels, 2)); - dst[ix + 3] = static_cast(vgetq_lane_u32(pixels, 3)); - }; - - auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { - uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride); - uint64_t acc = - static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | - (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { - uint64x2_t offset_low = - vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); - uint64x2_t offset_high = - vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); - uint64_t acc = - static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); - uint64x2_t rawsrc = vdupq_n_u64(acc); - acc = - static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | - (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); - rawsrc = vsetq_lane_u64(acc, rawsrc, 1); - return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); - }; - - auto calculate_linear_replicated_border = [&](uint32_t x) { - auto load_floats = [&](uint32x4_t x, uint32x4_t y) { - if constexpr (IsLarge) { - return load_src_into_floats_large(x, y); - } else { - return load_src_into_floats_small(x, y); - } - }; - 
auto &&[xf, yf] = calculate_coordinates(x); - // Truncating convert to int - uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); - uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); - - // Get fractional part, or 0 if out of range - float32x4_t zero = vdupq_n_f32(0.F); - uint32x4_t x_in_range = - vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); - uint32x4_t y_in_range = - vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); - float32x4_t xfrac = - vbslq_f32(x_in_range, vsubq_f32(xf, vrndmq_f32(xf)), zero); - float32x4_t yfrac = - vbslq_f32(y_in_range, vsubq_f32(yf, vrndmq_f32(yf)), zero); - - // x1 = x0 + 1, except if it's already xmax or out of range - uint32x4_t x1 = vsubq_u32(x0, x_in_range); - uint32x4_t y1 = vsubq_u32(y0, y_in_range); - - // Calculate offsets from coordinates (y * stride + x) - // a: top left, b: top right, c: bottom left, d: bottom right - float32x4_t a = load_floats(x0, y0); - float32x4_t b = load_floats(x1, y0); - float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); - float32x4_t c = load_floats(x0, y1); - float32x4_t d = load_floats(x1, y1); - float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); - float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); - return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); + return FloatVectorPair{xf, yf}; }; auto calculate_linear = [&](uint32_t x) { + float32x4_t a, b, c, d, xfrac, yfrac; if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicated_border(x); + load_quad_pixels_replicate( + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, src_rows, + xfrac, yfrac, a, b, c, d); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - auto &&[xf, yf] = calculate_coordinates(x); - return calculate_linear_constant_border( - xf, yf, src_rows, static_cast(src_width), - static_cast(src_height), border_value); + load_quad_pixels_constant( + calculate_coordinates(x), v_xmax, v_ymax, v_src_stride, + border_value[0], src_rows, xfrac, yfrac, a, b, c, d); } + return lerp_2d(xfrac, yfrac, a, b, c, d); }; - auto process_row = [&](uint32_t y, Columns dst) { + for (size_t y = y_begin; y < y_end; ++y) { float dy = static_cast(y); + Columns dst = dst_rows.as_columns(); // Calculate half-transformed values at the first pixel (nominators) // tw = T6*x + T7*y + T8 // tx = (T0*x + T1*y + T2) / tw @@ -364,20 +93,23 @@ void warp_perspective_operation(Rows src_rows, static const size_t kStep = VecTraits::num_lanes(); LoopUnroll2 loop{dst_width, kStep}; if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { loop.unroll_once([&](size_t x) { - vector_path_nearest_constant_border(static_cast(x), dst); - }); - } else if constexpr (IsLarge) { - loop.unroll_once([&](size_t x) { - vector_path_nearest_large(static_cast(x), dst); + auto &&[xf, yf] = calculate_coordinates(x); + transform_pixels_replicate( + xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x)); }); } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); loop.unroll_once([&](size_t x) { - vector_path_nearest_small(static_cast(x), dst); + auto &&[xf, yf] = calculate_coordinates(x); + transform_pixels_constant( + xf, yf, v_xmax, v_ymax, v_src_stride, src_rows, dst.at(x), + border_value[0]); }); } - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { + } else { + static_assert(Inter == KLEIDICV_INTERPOLATION_LINEAR); loop.unroll_four_times([&](size_t _x) { 
uint32_t x = static_cast(_x); ScalarType *p_dst = &dst[static_cast(_x)]; @@ -400,54 +132,11 @@ void warp_perspective_operation(Rows src_rows, p_dst[2] = vgetq_lane_u32(res, 2); p_dst[3] = vgetq_lane_u32(res, 3); }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - process_row(y, dst_rows.as_columns()); ++dst_rows; } } -// Convert border_type to a template argument. -template -void warp_perspective_operation(kleidicv_border_type_t border_type, - Args &&...args) { - if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - warp_perspective_operation( - std::forward(args)...); - } else { - warp_perspective_operation( - std::forward(args)...); - } -} - -// Convert interpolation_type to a template argument. -template -void warp_perspective_operation( - kleidicv_interpolation_type_t interpolation_type, Args &&...args) { - if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { - warp_perspective_operation( - std::forward(args)...); - } else { - warp_perspective_operation( - std::forward(args)...); - } -} - -} // namespace - -// Most of the complexity comes from parameter checking. -// NOLINTBEGIN(readability-function-cognitive-complexity) template kleidicv_error_t warp_perspective_stripe( const T *src, size_t src_stride, size_t src_width, size_t src_height, @@ -467,9 +156,10 @@ kleidicv_error_t warp_perspective_stripe( // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits + // Empty source image is not supported if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - src_stride >= (1ULL << 32)) { + src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) { return KLEIDICV_ERROR_RANGE; } @@ -477,16 +167,10 @@ kleidicv_error_t warp_perspective_stripe( Rows dst_rows{dst, dst_stride, channels}; dst_rows += y_begin; - if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - warp_perspective_operation( - interpolation, border_type, src_rows, src_width, src_height, - transformation, border_value, dst_rows, dst_width, y_begin, y_end); - } else { - warp_perspective_operation( - interpolation, border_type, src_rows, src_width, src_height, - transformation, border_value, dst_rows, dst_width, y_begin, y_end); - } - // NOLINTEND(readability-function-cognitive-complexity) + transform_operation(is_image_large(src_rows, src_height), interpolation, + border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, + y_begin, y_end); return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h deleted file mode 100644 index d75958b86..000000000 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "common_sc.h" -#include "kleidicv/ctypes.h" -#include "kleidicv/kleidicv.h" -#include "kleidicv/sve2.h" -#include "kleidicv/traits.h" -#include "kleidicv/transform/warp_perspective.h" -#include "kleidicv/types.h" - -namespace KLEIDICV_TARGET_NAMESPACE { - -// Gather load is not available in streaming mode, and in general random access -// is not recommended for SME. 
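// The !KLEIDICV_TARGET_SME2 guard below exists because these kernels fetch
// pixels through SVE gather loads, which are not available in streaming
// mode. A minimal sketch of that access pattern, using a hypothetical helper
// name (this is not code from the patch):

#include <arm_sve.h>
#include <cstdint>

svuint32_t gather_pixels(svbool_t pg, const uint8_t *base, svuint32_t x,
                         svuint32_t y, svuint32_t stride) {
  // Per-lane offset into the image: y * stride + x.
  svuint32_t offsets = svmla_u32_x(pg, x, y, stride);
  // One independent byte load per active lane, zero-extended to 32 bits.
  // This is the instruction class that streaming mode lacks.
  return svld1ub_gather_u32offset_u32(pg, base, offsets);
}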
-#if !KLEIDICV_TARGET_SME2 - -// Template for WarpPerspective transformation. -// Destination pixels are filled from the source, by taking pixels using the -// transformed coordinates that are calculated as follows: -// -// [ T0, T1, T2 ] [ x ] -// (x',y',w') = [ T3, T4, T5 ] * [ y ] -// [ T6, T7, T8 ] [ 1 ] -// then -// -// xt = x' / w' -// yt = y' / w' -// -// or putting it together: -// -// xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) -// yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) -// - -template -void remap32f_process_rows(Rows src_rows, size_t src_width, - size_t src_height, const ScalarType *border_value, - Rows dst_rows, size_t dst_width, - size_t y_begin, size_t y_end, - const float transform[9]) { - svbool_t pg_all32 = svptrue_b32(); - svuint32_t sv_xmax = svdup_n_u32(src_width - 1); - svuint32_t sv_ymax = svdup_n_u32(src_height - 1); - svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); - svuint32_t sv_border; - // sv_border is only used if the border type is constant. - // If the border type is not constant then border_value is permitted to be - // null and must not be read. - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - sv_border = svdup_n_u32(border_value[0]); - } - - svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); - svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); - - const size_t kStep = VecTraits::num_lanes(); - - svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); - svfloat32_t T0 = svdup_n_f32(transform[0]); - svfloat32_t T3 = svdup_n_f32(transform[3]); - svfloat32_t T6 = svdup_n_f32(transform[6]); - svfloat32_t tx0, ty0, tw0; - - auto coordinate_getter = [&](svbool_t, size_t x) { - svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast(x)); - // Calculate half-transformed values from the first few pixel values, - // plus Tn*x, similarly to the one above - // Calculate inverse weight because division is expensive - svfloat32_t iw = - svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); - svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); - svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); - - // Calculate coordinates into the source image - return svcreate2(svmul_f32_x(pg_all32, tx, iw), - svmul_f32_x(pg_all32, ty, iw)); - }; - - auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { - svfloat32x2_t coords = coordinate_getter(pg32, x); - svfloat32_t xf = svget2(coords, 0); - svfloat32_t yf = svget2(coords, 1); - - svuint32_t xi, yi; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - // Round to the nearest integer - xi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); - yi = svreinterpret_u32_s32( - svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); - } else { - // Round to the nearest integer, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - xi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), - sv_xmax); - yi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), - sv_ymax); - } - return svcreate2(xi, yi); - }; - - auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { - svbool_t in_range = - svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); - svuint32_t result = load_common( - in_range, x, y, sv_src_stride, src_rows); - // Select between source pixels and border colour - return svsel_u32(in_range, result, sv_border); - }; - - auto vector_path_nearest_4x = 
[&](size_t x, Columns dst) { - auto load_source = [&](svuint32x2_t coords) { - svuint32_t x = svget2(coords, 0); - svuint32_t y = svget2(coords, 1); - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - return get_pixels_or_border(pg_all32, x, y); - } else { - return load_common(pg_all32, x, y, sv_src_stride, - src_rows); - } - }; - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t res32_0 = - load_source(calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - svuint32_t res32_1 = - load_source(calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += kStep; - res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x)); - x += kStep; - res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x)); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = - svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1)); - svst1(svptrue_b8(), p_dst, result); - }; - - auto vector_path_nearest_tail = [&](size_t x, size_t x_max, - Columns dst) { - size_t length = x_max - x; - svbool_t pg32 = svwhilelt_b32(0ULL, length); - - svuint32x2_t coords = calculate_nearest_coordinates(pg32, x); - svuint32_t xi = svget2(coords, 0); - svuint32_t yi = svget2(coords, 1); - - svuint32_t result; - if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - result = get_pixels_or_border(pg32, xi, yi); - } else { - result = load_common(pg32, xi, yi, sv_src_stride, - src_rows); - } - svst1b_u32(pg32, &dst[static_cast(x)], result); - }; - - auto calculate_linear = [&](svbool_t pg, uint32_t x) { - if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_replicated_border( - pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows); - } else { - static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - svfloat32x2_t coords = coordinate_getter(pg, x); - return calculate_linear_constant_border( - pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); - } - }; - - auto process_row = [&](size_t y) { - float fy = static_cast(y); - // Calculate half-transformed values at the first pixel (nominators) - // tw = T6*x + T7*y + T8 - // tx = (T0*x + T1*y + T2) / tw - // ty = (T3*x + T4*y + T5) / tw - tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2])); - ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5])); - tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8])); - - Columns dst = dst_rows.as_columns(); - LoopUnroll2 loop{dst_width, kStep}; - if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); }); - loop.unroll_once( - [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); }); - loop.remaining([&](size_t x, size_t length) { - vector_path_nearest_tail(x, length, dst); - }); - } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { - loop.unroll_four_times([&](size_t x) { - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(pg_all32, x); - x += kStep; - svuint32_t res1 = calculate_linear(pg_all32, x); - svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - x += kStep; - res0 = calculate_linear(pg_all32, x); - x += kStep; - res1 = calculate_linear(pg_all32, x); - svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), - svreinterpret_u16_u32(res1)); - svst1_u8(svptrue_b8(), p_dst, - 
svuzp1_u8(svreinterpret_u8_u16(result16_0), - svreinterpret_u8_u16(result16_1))); - }); - loop.unroll_once([&](size_t x) { - ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t result = calculate_linear(pg_all32, x); - svst1b_u32(pg_all32, p_dst, result); - }); - loop.remaining([&](size_t x, size_t x_max) { - ScalarType *p_dst = &dst[static_cast(x)]; - svbool_t pg32 = svwhilelt_b32(x, x_max); - svuint32_t result = calculate_linear(pg32, x); - svst1b_u32(pg32, p_dst, result); - }); - } else { - static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || - Inter == KLEIDICV_INTERPOLATION_LINEAR, - ": Unknown interpolation type!"); - } - }; - - for (size_t y = y_begin; y < y_end; ++y) { - process_row(y); - ++dst_rows; - } -} - -// Most of the complexity comes from parameter checking. -// NOLINTBEGIN(readability-function-cognitive-complexity) -template -KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( - const T *src, size_t src_stride, size_t src_width, size_t src_height, - T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, - size_t y_begin, size_t y_end, const float transform[9], size_t channels, - kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, const T *border_value) { - CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); - CHECK_POINTERS(transform); - CHECK_IMAGE_SIZE(src_width, src_height); - CHECK_IMAGE_SIZE(dst_width, dst_height); - if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { - return KLEIDICV_ERROR_NULL_POINTER; - } - - // Calculating in float32_t will only be precise until 24 bits, and - // multiplication can only be done with 32x32 bits - if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || - dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || - src_stride >= (1ULL << 32)) { - return KLEIDICV_ERROR_RANGE; - } - - Rows src_rows{src, src_stride, channels}; - Rows dst_rows{dst, dst_stride, channels}; - Rectangle rect{dst_width, dst_height}; - - dst_rows += y_begin; - - remap32f_process_rows(remap_image_is_large(src_rows, src_height), - interpolation, border_type, src_rows, src_width, - src_height, border_value, dst_rows, dst_width, - y_begin, y_end, transform); - - return KLEIDICV_OK; -} -// NOLINTEND(readability-function-cognitive-complexity) - -#define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(type) \ - template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \ - warp_perspective_stripe_sc( \ - const type *src, size_t src_stride, size_t src_width, size_t src_height, \ - type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ - size_t y_begin, size_t y_end, const float transformation[9], \ - size_t channels, kleidicv_interpolation_type_t interpolation, \ - kleidicv_border_type_t border_type, const type *border_value) - -KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(uint8_t); - -#endif - -} // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index 5e3649050..b1f3df8d2 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -1,23 +1,275 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 +#include + +#include +#include + #include "kleidicv/ctypes.h" -#include "warp_perspective_sc.h" +#include "kleidicv/sve2.h" 
+#include "kleidicv/types.h" +#include "transform_sve2.h" namespace kleidicv::sve2 { +// Template for WarpPerspective transformation. +// Destination pixels are filled from the source, by taking pixels using the +// transformed coordinates that are calculated as follows: +// +// [ T0, T1, T2 ] [ x ] +// (x',y',w') = [ T3, T4, T5 ] * [ y ] +// [ T6, T7, T8 ] [ 1 ] +// then +// +// xt = x' / w' +// yt = y' / w' +// +// or putting it together: +// +// xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) +// yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) +// + +template +void transform_operation(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value, + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end, + const float transform[9]) { + svbool_t pg_all32 = svptrue_b32(); + svuint32_t sv_xmax = svdup_n_u32(src_width - 1); + svuint32_t sv_ymax = svdup_n_u32(src_height - 1); + svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); + svuint32_t sv_border; + // sv_border is only used if the border type is constant. + // If the border type is not constant then border_value is permitted to be + // null and must not be read. + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + sv_border = svdup_n_u32(border_value[0]); + } + + svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); + svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); + + const size_t kStep = VecTraits::num_lanes(); + + svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); + svfloat32_t T0 = svdup_n_f32(transform[0]); + svfloat32_t T3 = svdup_n_f32(transform[3]); + svfloat32_t T6 = svdup_n_f32(transform[6]); + svfloat32_t tx0, ty0, tw0; + + auto calc_coords = [&](svbool_t, size_t x) { + svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast(x)); + // Calculate half-transformed values from the first few pixel values, + // plus Tn*x, similarly to the one above + // Calculate inverse weight because division is expensive + svfloat32_t iw = + svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); + svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); + svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); + + // Calculate coordinates into the source image + return svcreate2(svmul_f32_x(pg_all32, tx, iw), + svmul_f32_x(pg_all32, ty, iw)); + }; + + auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { + svfloat32x2_t coords = calc_coords(pg32, x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + + svuint32_t xi, yi; + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + // Round to the nearest integer + xi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); + yi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); + } else { + // Round to the nearest integer, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + } + return svcreate2(xi, yi); + }; + + auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + svbool_t in_range = + svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = + load_xy(in_range, x, y, sv_src_stride, src_rows); + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); + }; + 
+  auto vector_path_nearest_4x = [&](size_t x, Columns dst) {
+    auto load_source = [&](svuint32x2_t coords) {
+      svuint32_t x = svget2(coords, 0);
+      svuint32_t y = svget2(coords, 1);
+      if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+        return get_pixels_or_border(pg_all32, x, y);
+      } else {
+        return load_xy(pg_all32, x, y, sv_src_stride,
+                       src_rows);
+      }
+    };
+    ScalarType *p_dst = &dst[static_cast(x)];
+    svuint32_t res32_0 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
+    x += kStep;
+    svuint32_t res32_1 =
+        load_source(calculate_nearest_coordinates(pg_all32, x));
+    svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    x += kStep;
+    res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x));
+    x += kStep;
+    res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x));
+    svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
+                                    svreinterpret_u16_u32(res32_1));
+    svuint8_t result =
+        svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1));
+    svst1(svptrue_b8(), p_dst, result);
+  };
+
+  auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
+                                      Columns dst) {
+    size_t length = x_max - x;
+    svbool_t pg32 = svwhilelt_b32(0ULL, length);
+
+    svuint32x2_t coords = calculate_nearest_coordinates(pg32, x);
+    svuint32_t xi = svget2(coords, 0);
+    svuint32_t yi = svget2(coords, 1);
+
+    svuint32_t result;
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
+      result = get_pixels_or_border(pg32, xi, yi);
+    } else {
+      result =
+          load_xy(pg32, xi, yi, sv_src_stride, src_rows);
+    }
+    svst1b_u32(pg32, &dst[static_cast(x)], result);
+  };
+
+  auto calculate_linear = [&](svbool_t pg, uint32_t x) {
+    svfloat32x2_t coords = calc_coords(pg, x);
+    if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
+      return calculate_linear_replicated_border(
+          pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows);
+    } else {
+      static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
+      return calculate_linear_constant_border(
+          pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
+    }
+  };
+
+  auto process_row = [&](size_t y) {
+    float fy = static_cast(y);
+    // Calculate half-transformed values at the first pixel (numerators)
+    // tw = T6*x + T7*y + T8
+    // tx = (T0*x + T1*y + T2) / tw
+    // ty = (T3*x + T4*y + T5) / tw
+    tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2]));
+    ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
+    tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
+
+    Columns dst = dst_rows.as_columns();
+    LoopUnroll2 loop{dst_width, kStep};
+    if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
+      loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
+      loop.unroll_once(
+          [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); });
+      loop.remaining([&](size_t x, size_t length) {
+        vector_path_nearest_tail(x, length, dst);
+      });
+    } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
+      loop.unroll_four_times([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svuint32_t res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        svuint32_t res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        x += kStep;
+        res0 = calculate_linear(pg_all32, x);
+        x += kStep;
+        res1 = calculate_linear(pg_all32, x);
+        svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
+                                           svreinterpret_u16_u32(res1));
+        svst1_u8(svptrue_b8(), p_dst,
+                 
svuzp1_u8(svreinterpret_u8_u16(result16_0),
+                           svreinterpret_u8_u16(result16_1)));
+      });
+      loop.unroll_once([&](size_t x) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svuint32_t result = calculate_linear(pg_all32, x);
+        svst1b_u32(pg_all32, p_dst, result);
+      });
+      loop.remaining([&](size_t x, size_t x_max) {
+        ScalarType *p_dst = &dst[static_cast(x)];
+        svbool_t pg32 = svwhilelt_b32(x, x_max);
+        svuint32_t result = calculate_linear(pg32, x);
+        svst1b_u32(pg32, p_dst, result);
+      });
+    } else {
+      static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
+                        Inter == KLEIDICV_INTERPOLATION_LINEAR,
+                    "Unknown interpolation type!");
+    }
+  };
+
+  for (size_t y = y_begin; y < y_end; ++y) {
+    process_row(y);
+    ++dst_rows;
+  }
+}
+
 template 
-kleidicv_error_t warp_perspective_stripe(
+KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe(
     const T *src, size_t src_stride, size_t src_width, size_t src_height,
     T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
-    size_t y_begin, size_t y_end, const float transformation[9],
-    size_t channels, kleidicv_interpolation_type_t interpolation,
+    size_t y_begin, size_t y_end, const float transform[9], size_t channels,
+    kleidicv_interpolation_type_t interpolation,
     kleidicv_border_type_t border_type, const T *border_value) {
-  return warp_perspective_stripe_sc(
-      src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
-      dst_height, y_begin, y_end, transformation, channels, interpolation,
-      border_type, border_value);
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTERS(transform);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+  if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
+    return KLEIDICV_ERROR_NULL_POINTER;
+  }
+
+  // Calculating in float32_t will only be precise up to 24 bits, and
+  // multiplication can only be done with 32x32 bits
+  // Empty source image is not supported
+  if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
+      dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
+      src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) {
+    return KLEIDICV_ERROR_RANGE;
+  }
+
+  Rows src_rows{src, src_stride, channels};
+  Rows dst_rows{dst, dst_stride, channels};
+  Rectangle rect{dst_width, dst_height};
+
+  dst_rows += y_begin;
+
+  transform_operation(is_image_large(src_rows, src_height), interpolation,
+                      border_type, src_rows, src_width, src_height,
+                      border_value, dst_rows, dst_width, y_begin, y_end,
+                      transform);
+
+  return KLEIDICV_OK;
 }
 
 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \
diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp
index 7a18a0bcf..d3bde250c 100644
--- a/test/api/test_remap.cpp
+++ b/test/api/test_remap.cpp
@@ -599,8 +599,8 @@ class RemapS16Point5 : public testing::Test {
     EXPECT_EQ_ARRAY2D(actual, expected);
   }
 
-  static ScalarType lerp2d(size_t cx, size_t cy, ScalarType a, ScalarType b,
-                           ScalarType c, ScalarType d) {
+  static ScalarType lerp_2d(size_t cx, size_t cy, ScalarType a, ScalarType b,
+                            ScalarType c, ScalarType d) {
     size_t inv_cx = FRAC_MAX - cx, inv_cy = FRAC_MAX - cy;
     ScalarType r =
         static_cast((inv_cx * inv_cy * a + cx * inv_cy * b +
                      inv_cx * cy * c +
@@ -635,8 +635,8 @@
           const int16_t *coords = mapxy.at(row, column * 2);
           int16_t x = coords[0], y = coords[1];
           *expected.at(row, column * src.channels() + ch) =
-              lerp2d(x_frac, y_frac, get_src(x, y)[ch], 
get_src(x + 1, y)[ch], - get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); + lerp_2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], + get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); } } } @@ -1269,8 +1269,8 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { const size_t src_stride = kW * sizeof(TypeParam); const size_t big_stride = (1UL << 32UL) - sizeof(TypeParam); const size_t dst_stride = kW * sizeof(TypeParam); - float mapx[kW] = {}; - float mapy[kW] = {}; + float mapx[kW] = {-0.2, 0.3, 1.4, 2.5}; + float mapy[kW] = {-1.8, -0.7, 0.6, 1.3}; const size_t mapx_stride = kW * sizeof(float); const size_t mapy_stride = kW * sizeof(float); @@ -1288,7 +1288,7 @@ TYPED_TEST(RemapF32, ZeroHeightImage) { border_type, border_value)); } const TypeParam border_value[1] = {0}; - EXPECT_EQ(KLEIDICV_OK, + EXPECT_EQ(KLEIDICV_ERROR_RANGE, remap_f32()( src, src_stride, kW, 0, dst, dst_stride, kW, 1, 1, mapx, mapx_stride, mapy, mapy_stride, KLEIDICV_INTERPOLATION_LINEAR, diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp index db5004de5..7efc85ab9 100644 --- a/test/api/test_warp_perspective.cpp +++ b/test/api/test_warp_perspective.cpp @@ -724,8 +724,8 @@ class WarpPerspectiveLinear : public testing::Test { INT_MIN, std::min(floor(fy), KLEIDICV_MAX_IMAGE_PIXELS))); ptrdiff_t ix1 = ix0 + 1; ptrdiff_t iy1 = iy0 + 1; - double xfrac = std::isfinite(fx) ? fx - floor(fx) : 0.0; - double yfrac = std::isfinite(fy) ? fy - floor(fy) : 0.0; + double xfrac = fx - floor(fx); + double yfrac = fy - floor(fy); for (size_t ch = 0; ch < src.channels(); ++ch) { double a = get_src(ix0, iy0)[ch]; double b = get_src(ix1, iy0)[ch]; @@ -844,15 +844,16 @@ TYPED_TEST(WarpPerspectiveLinear, RandomTransform) { float transform[9]; // Not entirely random, as very small and very big floats (in absolute value) // cause too big errors and they are far from being valid use cases anyway - test::PseudoRandomNumberGeneratorIntRange exponentGenerator(-7, 7); - test::PseudoRandomNumberGeneratorFloatRange mantissaGenerator(-1.0, + test::PseudoRandomNumberGeneratorIntRange exponentGenerator(-7, 7); + test::PseudoRandomNumberGeneratorIntRange signGenerator(0, 1); + test::PseudoRandomNumberGeneratorFloatRange mantissaGenerator(0.01, 1.0); for (size_t cc = 0; cc < 100; ++cc) { for (size_t i = 0; i < 9; ++i) { transform[i] = - mantissaGenerator.next().value_or(1.0) * - static_cast( - exp(static_cast(exponentGenerator.next().value_or(1.0)))); + mantissaGenerator.next().value_or(1.0F) * + (2.0F * static_cast(signGenerator.next().value_or(0)) - 1.0F) * + expf(static_cast(exponentGenerator.next().value_or(1))); } size_t src_w = 3 * test::Options::vector_lanes() - 1; @@ -875,13 +876,26 @@ TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) { }; // clang-format on - size_t src_w = 3 * test::Options::vector_lanes() - 1; - size_t src_h = 4; - size_t dst_w = src_w; - size_t dst_h = src_h; + size_t kW = 3 * test::Options::vector_lanes() - 1; + size_t kH = 2; + + test::Array2D source{kW, kH, 1, 1}; + test::Array2D dst{kW, kH, 1, 1}; + + for (int64_t y = 0; y < static_cast(source.height()); ++y) { + for (int64_t x = 0; x < static_cast(source.width()); ++x) { + const int64_t kMaxVal = std::numeric_limits::max() / 2; + *source.at(y, x) = + kMaxVal / 4 + abs((x + y) % (2 * kMaxVal + 1) - kMaxVal); + } + } + for (auto [border_type, border_value] : get_borders()) { - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, - border_type, border_value, 3); + EXPECT_EQ(KLEIDICV_OK, + kleidicv_warp_perspective_u8( + 
source.data(), source.stride(), kW, kH, dst.data(), + dst.stride(), kW, kH, transform_div_by_zero, 1, + KLEIDICV_INTERPOLATION_LINEAR, border_type, border_value)); } } -- GitLab
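A note on the two behaviour changes these tests pin down. First, a zero-width
or zero-height source is now rejected with KLEIDICV_ERROR_RANGE instead of
being accepted. A minimal caller-side sketch, assuming a kleidicv_remap_f32_u8
entry point with the same parameter order as the remap_f32 test wrapper above
(the exact symbol name is an assumption, not taken from this patch):

#include <cstdint>

#include "kleidicv/kleidicv.h"

int main() {
  const uint8_t src[4] = {};  // backing storage exists, but src_width is 0
  uint8_t dst[4] = {};
  const float mapx[4] = {-0.5F, 0.25F, 1.0F, 2.5F};
  const float mapy[4] = {0.0F, 0.0F, 0.0F, 0.0F};
  const uint8_t border_value[1] = {0};

  // Previously a zero-size source was accepted; it is now rejected up front.
  kleidicv_error_t err = kleidicv_remap_f32_u8(
      src, sizeof(src), /*src_width=*/0, /*src_height=*/1, dst, sizeof(dst),
      /*dst_width=*/4, /*dst_height=*/1, /*channels=*/1, mapx, sizeof(mapx),
      mapy, sizeof(mapy), KLEIDICV_INTERPOLATION_LINEAR,
      KLEIDICV_BORDER_TYPE_CONSTANT, border_value);
  return err == KLEIDICV_ERROR_RANGE ? 0 : 1;
}

Second, division by zero inside the perspective transform is no longer forced
to 0: the resulting NaN coordinates fall through the normal border handling,
which is why the DivisionByZero test above now simply expects KLEIDICV_OK.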