From 8dc49a2b81c112f6acc5dc0441652fbf3799f99f Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 14 Jan 2025 15:49:21 +0000 Subject: [PATCH 1/2] Add support for constant border to Warp Perspective --- CHANGELOG.md | 2 +- doc/functionality.md | 8 +- doc/opencv.md | 2 +- kleidicv/include/kleidicv/kleidicv.h | 1 + .../kleidicv/transform/warp_perspective.h | 4 +- .../src/transform/warp_perspective_neon.cpp | 264 ++++++++++++++--- kleidicv/src/transform/warp_perspective_sc.h | 203 ++++++++++--- scripts/benchmark/benchmarks.txt | 4 + test/api/test_thread.cpp | 4 +- test/api/test_warp_perspective.cpp | 269 +++++++++++++----- 10 files changed, 595 insertions(+), 166 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac4a4bfff..8ad96463c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ This changelog aims to follow the guiding principles of - Nearest neighbour, for replicated borders with 1-channel u8 and u16 inputs. - Fixed-point interpolation, for replicated borders with 1-channel u8 input. - WarpPerspective implementation - - Nearest and Linear interpolation method, for replicated borders and 1-channel u8 input. + - Nearest and Linear interpolation method, for 1-channel u8 input. ### Changed - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL. diff --git a/doc/functionality.md b/doc/functionality.md index f2bf16266..9cc895415 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -99,7 +99,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. | Remap int16+uint16 fixed-point coordinates | x | | # WarpPerspective -| | u8 | u16 | -|------------------------------------------|-----|-----| -| Nearest neighbour, replicated borders | x | | -| Linear interpolation, replicated borders | x | | +| | u8 | +|----------------------|-----| +| Nearest neighbour | x | +| Linear interpolation | x | diff --git a/doc/opencv.md b/doc/opencv.md index a6f1d2fb8..478712113 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -229,7 +229,7 @@ Notes on parameters: * `src.cols`, `src.rows`, `dst.cols`, `dst.rows` - must be less than 2^24 * `src.step` - must be less than 2^32 * `dst.cols` - must be at least 8 -* `borderMode` - only supports `BORDER_REPLICATE` +* `borderMode` - supports `BORDER_REPLICATE` and `BORDER_CONSTANT` * `interpolation` - supports `INTER_NEAREST` and `INTER_LINEAR` ### [`cv::pyrDown()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#gaf9bba239dfca11654cb7f50f889fc2ff) diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 1133718ea..517cd41e5 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1912,6 +1912,7 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8( /// @param border_type Way of handling the border. The supported border types /// are: \n /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE +/// - @ref KLEIDICV_BORDER_TYPE_CONSTANT /// @param border_value Border value if the border_type is /// @ref KLEIDICV_BORDER_TYPE_CONSTANT. kleidicv_error_t kleidicv_warp_perspective_u8( diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h index 45b512764..cf7962afe 100644 --- a/kleidicv/include/kleidicv/transform/warp_perspective.h +++ b/kleidicv/include/kleidicv/transform/warp_perspective.h @@ -35,8 +35,8 @@ inline bool warp_perspective_is_implemented( return (dst_width >= 8 && (interpolation == KLEIDICV_INTERPOLATION_NEAREST || interpolation == KLEIDICV_INTERPOLATION_LINEAR) && - border_type == - kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_REPLICATE && + (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || + border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && channels == 1); } else { return false; diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index bba3f7e82..1f4c6dbf4 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -4,6 +4,8 @@ #include +#include + #include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" @@ -31,15 +33,153 @@ namespace kleidicv::neon { // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // +namespace { + typedef struct { float32x4_t x, y; } CoordVectorPair; -template +template +inline uint32x4_t get_pixels_or_border_large(uint32x4_t x, uint32x4_t y, + uint32x4_t v_xmax, + uint32x4_t v_ymax, + uint32x2_t v_src_stride, + Rows src_rows, + const ScalarType *border_value) { + uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + uint64x2_t offsets_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); + uint64x2_t offsets_high = + vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); + // Copy pixels from source + uint32_t pixel_data[4] = { + vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u64(offsets_low, 0)] + : border_value[0], + vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u64(offsets_low, 1)] + : border_value[0], + vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u64(offsets_high, 0)] + : border_value[0], + vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u64(offsets_high, 1)] + : border_value[0], + }; + return vld1q_u32(pixel_data); +} + +template +inline uint32x4_t get_pixels_or_border_small(uint32x4_t x, uint32x4_t y, + uint32x4_t v_xmax, + uint32x4_t v_ymax, + uint32x4_t v_src_stride, + Rows src_rows, + const ScalarType *border_value) { + uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); + // Calculate offsets from coordinates (y * stride + x) + // Use this path only when the final offsets fit into 32 bits + uint32x4_t offsets = vmlaq_u32(x, y, v_src_stride); + // Copy pixels from source + uint32_t pixel_data[4] = { + vgetq_lane_u32(in_range, 0) ? src_rows[vgetq_lane_u32(offsets, 0)] + : border_value[0], + vgetq_lane_u32(in_range, 1) ? src_rows[vgetq_lane_u32(offsets, 1)] + : border_value[0], + vgetq_lane_u32(in_range, 2) ? src_rows[vgetq_lane_u32(offsets, 2)] + : border_value[0], + vgetq_lane_u32(in_range, 3) ? src_rows[vgetq_lane_u32(offsets, 3)] + : border_value[0], + }; + return vld1q_u32(pixel_data); +} + +template +inline void get_edge_pixels(unsigned &a_result, unsigned &b_result, + unsigned &c_result, unsigned &d_result, int x0, + int y0, ptrdiff_t offset, + Rows src_rows, int src_width, + int src_height) { + if (y0 >= 0) { + if (x0 >= 0) { + a_result = src_rows[offset]; + } + if (x0 + 1 < src_width) { + b_result = src_rows[offset + 1]; + } + } + if (y0 + 1 < src_height) { + offset += src_rows.stride(); + if (x0 >= 0) { + c_result = src_rows[offset]; + } + if (x0 + 1 < src_width) { + d_result = src_rows[offset + 1]; + } + } +} + +template +inline uint32x4_t calculate_linear_constant_border( + float32x4_t xf, float32x4_t yf, Rows src_rows, + int src_width, int src_height, const ScalarType *border_value) { + // Convert obviously out-of-range coordinates to values that are just beyond + // the largest permitted image width & height. This avoids the need for + // special case handling elsewhere. + float32x4_t big = vdupq_n_f32(1 << 24); + xf = vbslq_f32(vcleq_f32(vabsq_f32(xf), big), xf, big); + yf = vbslq_f32(vcleq_f32(vabsq_f32(yf), big), yf, big); + + int32x4_t x0 = vcvtmq_s32_f32(xf); + int32x4_t y0 = vcvtmq_s32_f32(yf); + int x0_array[4], y0_array[4]; + unsigned a_array[4], b_array[4], c_array[4], d_array[4]; + vst1q_s32(x0_array, x0); + vst1q_s32(y0_array, y0); + for (int i = 0; i < 4; ++i) { + int x0i = x0_array[i]; + int y0i = y0_array[i]; + ptrdiff_t offset = x0i + y0i * src_rows.stride(); + + if (x0i < 0 || x0i + 1 >= src_width || y0i < 0 || y0i + 1 >= src_height) { + // Not entirely within the source image + + a_array[i] = b_array[i] = c_array[i] = d_array[i] = border_value[0]; + + if (x0i < -1 || x0i >= src_width || y0i < -1 || y0i >= src_height) { + // Completely outside the source image + continue; + } + + get_edge_pixels(a_array[i], b_array[i], c_array[i], d_array[i], x0i, y0i, + offset, src_rows, src_width, src_height); + continue; + } + + // Completely inside the source image + a_array[i] = src_rows[offset]; + b_array[i] = src_rows[offset + 1]; + offset += src_rows.stride(); + c_array[i] = src_rows[offset]; + d_array[i] = src_rows[offset + 1]; + } + + float32x4_t xfrac = vsubq_f32(xf, vrndmq_f32(xf)); + float32x4_t yfrac = vsubq_f32(yf, vrndmq_f32(yf)); + float32x4_t a = vcvtq_f32_u32(vld1q_u32(a_array)); + float32x4_t b = vcvtq_f32_u32(vld1q_u32(b_array)); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = vcvtq_f32_u32(vld1q_u32(c_array)); + float32x4_t d = vcvtq_f32_u32(vld1q_u32(d_array)); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vcvtaq_u32_f32(result); +} + +template void warp_perspective_operation(Rows src_rows, size_t src_width, size_t src_height, const float transform[9], + const ScalarType *border_value, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end) { static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; @@ -51,7 +191,6 @@ void warp_perspective_operation(Rows src_rows, vdupq_n_u32(static_cast(src_rows.stride())); uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); - float32x4_t tx0, ty0, tw0; auto calculate_coordinates = [&](uint32_t x) { @@ -106,6 +245,32 @@ void warp_perspective_operation(Rows src_rows, dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)]; }; + auto vector_path_nearest_constant_border = [&](uint32_t x, + Columns dst) { + auto &&[xf, yf] = calculate_coordinates(x); + // Convert coordinates to integers. + // Negative numbers will become large positive numbers. + // Since the source width and height is known to be <=2^24 these large + // positive numbers will always be treated as outside the source image + // bounds. + uint32x4_t xi = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); + uint32x4_t yi = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); + uint32x4_t pixels; + if constexpr (IsLarge) { + pixels = get_pixels_or_border_large(xi, yi, v_xmax, v_ymax, v_src_stride, + src_rows, border_value); + } else { + pixels = get_pixels_or_border_small(xi, yi, v_xmax, v_ymax, vq_src_stride, + src_rows, border_value); + } + + ptrdiff_t ix = static_cast(x); + dst[ix] = static_cast(vgetq_lane_u32(pixels, 0)); + dst[ix + 1] = static_cast(vgetq_lane_u32(pixels, 1)); + dst[ix + 2] = static_cast(vgetq_lane_u32(pixels, 2)); + dst[ix + 3] = static_cast(vgetq_lane_u32(pixels, 3)); + }; + auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride); uint64_t acc = @@ -134,7 +299,7 @@ void warp_perspective_operation(Rows src_rows, return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); }; - auto calculate_linear = [&](uint32_t x) { + auto calculate_linear_replicate = [&](uint32_t x) { auto load_floats = [&](uint32x4_t x, uint32x4_t y) { if constexpr (IsLarge) { return load_src_into_floats_large(x, y); @@ -174,6 +339,18 @@ void warp_perspective_operation(Rows src_rows, return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); }; + auto calculate_linear = [&](uint32_t x) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + return calculate_linear_replicate(x); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + auto &&[xf, yf] = calculate_coordinates(x); + return calculate_linear_constant_border( + xf, yf, src_rows, static_cast(src_width), + static_cast(src_height), border_value); + } + }; + auto process_row = [&](uint32_t y, Columns dst) { float dy = static_cast(y); // Calculate half-transformed values at the first pixel (nominators) @@ -187,7 +364,11 @@ void warp_perspective_operation(Rows src_rows, static const size_t kStep = VecTraits::num_lanes(); LoopUnroll2 loop{dst_width, kStep}; if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { - if constexpr (IsLarge) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + loop.unroll_once([&](size_t x) { + vector_path_nearest_constant_border(static_cast(x), dst); + }); + } else if constexpr (IsLarge) { loop.unroll_once([&](size_t x) { vector_path_nearest_large(static_cast(x), dst); }); @@ -232,6 +413,39 @@ void warp_perspective_operation(Rows src_rows, } } +// Convert border_type to a template argument. +template +void warp_perspective_operation(kleidicv_border_type_t border_type, + Args &&...args) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + warp_perspective_operation( + std::forward(args)...); + } else { + warp_perspective_operation( + std::forward(args)...); + } +} + +// Convert interpolation_type to a template argument. +template +void warp_perspective_operation( + kleidicv_interpolation_type_t interpolation_type, Args &&...args) { + if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { + warp_perspective_operation( + std::forward(args)...); + } else { + warp_perspective_operation( + std::forward(args)...); + } +} + +} // namespace + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -240,13 +454,17 @@ kleidicv_error_t warp_perspective_stripe( T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t y_begin, size_t y_end, const float transformation[9], size_t channels, kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t, const T *) { + kleidicv_border_type_t border_type, const T *border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTERS(transformation); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } + // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || @@ -260,35 +478,13 @@ kleidicv_error_t warp_perspective_stripe( dst_rows += y_begin; if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - switch (interpolation) { - case KLEIDICV_INTERPOLATION_NEAREST: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - case KLEIDICV_INTERPOLATION_LINEAR: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - default: - break; // GCOVR_EXCL_LINE - } + warp_perspective_operation( + interpolation, border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, y_begin, y_end); } else { - switch (interpolation) { - case KLEIDICV_INTERPOLATION_NEAREST: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - case KLEIDICV_INTERPOLATION_LINEAR: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - default: - break; // GCOVR_EXCL_LINE - } + warp_perspective_operation( + interpolation, border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, y_begin, y_end); } // NOLINTEND(readability-function-cognitive-complexity) return KLEIDICV_OK; diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h index a790a6acf..519d544dd 100644 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ b/kleidicv/src/transform/warp_perspective_sc.h @@ -4,7 +4,9 @@ #include +#include #include +#include #include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" @@ -37,11 +39,12 @@ namespace KLEIDICV_TARGET_NAMESPACE { // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // -template +template void warp_perspective_operation(Rows src_rows, size_t src_width, size_t src_height, const float transform[9], + const ScalarType *border_value, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end) { svbool_t pg_all64 = svptrue_b64(); @@ -50,6 +53,13 @@ void warp_perspective_operation(Rows src_rows, svuint32_t sv_xmax = svdup_n_u32(src_width - 1); svuint32_t sv_ymax = svdup_n_u32(src_height - 1); svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); + svuint32_t sv_border; + // sv_border is only used if the border type is constant. + // If the border type is not constant then border_value is permitted to be + // null and must not be read. + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + sv_border = svdup_n_u32(border_value[0]); + } svfloat32_t T0 = svdup_n_f32(transform[0]); svfloat32_t T3 = svdup_n_f32(transform[3]); @@ -82,14 +92,23 @@ void warp_perspective_operation(Rows src_rows, auto calculate_nearest_coordinates = [&](size_t x) { calculate_coordinates(x); - // Round to the nearest integer, clamp it to within the dimensions of the - // source image (negative values are already saturated to 0) - xi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), - sv_xmax); - yi = svmin_x(pg_all32, - svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), - sv_ymax); + + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + // Round to the nearest integer + xi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); + yi = svreinterpret_u32_s32( + svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); + } else { + // Round to the nearest integer, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + } }; auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) { @@ -112,9 +131,28 @@ void warp_perspective_operation(Rows src_rows, svreinterpret_u32_u64(result_t)); }; + auto get_pixels_or_border = [&](svuint32_t x, svuint32_t y) { + svbool_t in_range = svand_b_z(pg32, svcmple_u32(pg32, x, sv_xmax), + svcmple_u32(pg32, y, sv_ymax)); + + svuint32_t result; + if constexpr (IsLarge) { + svbool_t pg_b = in_range; + svbool_t pg_t = svtrn2_b32(in_range, svpfalse()); + result = load_large(pg_b, pg_t, x, y); + } else { + result = load_small(in_range, x, y); + } + + // Select between source pixels and border colour + return svsel_u32(in_range, result, sv_border); + }; + auto vector_path_nearest_4x = [&](size_t x, Columns dst) { auto load_source = [&](svuint32_t x, svuint32_t y) { - if constexpr (IsLarge) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + return get_pixels_or_border(x, y); + } else if constexpr (IsLarge) { return load_large(pg_all64, pg_all64, x, y); } else { return load_small(pg_all32, x, y); @@ -144,16 +182,22 @@ void warp_perspective_operation(Rows src_rows, auto vector_path_nearest_tail = [&](size_t x, size_t x_max, Columns dst) { size_t length = x_max - x; - svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2); - svbool_t pg_t = svwhilelt_b64(0ULL, length / 2); - svbool_t pg = svwhilelt_b32(0ULL, length); - // To avoid losing precision, the final indices use 64 bits + pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2); + pg64_t = svwhilelt_b64(0ULL, length / 2); + pg32 = svwhilelt_b32(0ULL, length); calculate_nearest_coordinates(x); - svuint32_t result = load_large(pg_b, pg_t, xi, yi); - svst1b_u32(pg, &dst[static_cast(x)], result); + svuint32_t result; + + if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { + result = get_pixels_or_border(xi, yi); + } else { + // To avoid losing precision, the final indices use 64 bits + result = load_large(pg64_b, pg64_t, xi, yi); + } + svst1b_u32(pg32, &dst[static_cast(x)], result); }; - auto calculate_linear = [&](size_t x) { + auto calculate_linear_replicate = [&](uint32_t x) { auto load_source = [&](svuint32_t x, svuint32_t y) { if constexpr (IsLarge) { return load_large(pg64_b, pg64_t, x, y); @@ -201,6 +245,59 @@ void warp_perspective_operation(Rows src_rows, svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F))); }; + auto calculate_linear_constant_border = [&](uint32_t x) { + calculate_coordinates(x); + + // Convert obviously out-of-range coordinates to values that are just beyond + // the largest permitted image width & height. This avoids the need for + // special case handling elsewhere. + svfloat32_t big = svdup_n_f32(1 << 24); + xf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, xf), big), xf, + big); + yf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, yf), big), yf, + big); + + svfloat32_t xf0 = svrintm_f32_x(pg_all32, xf); + svfloat32_t yf0 = svrintm_f32_x(pg_all32, yf); + + svint32_t x0 = svcvt_s32_x(pg_all32, xf0); + svint32_t y0 = svcvt_s32_x(pg_all32, yf0); + svint32_t x1 = svadd_s32_x(pg_all32, x0, svdup_n_s32(1)); + svint32_t y1 = svadd_s32_x(pg_all32, y0, svdup_n_s32(1)); + + svfloat32_t xfrac = svsub_f32_x(pg_all32, xf, xf0); + svfloat32_t yfrac = svsub_f32_x(pg_all32, yf, yf0); + + svfloat32_t a = svcvt_f32_u32_x( + pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0), + svreinterpret_u32_s32(y0))); + svfloat32_t b = svcvt_f32_u32_x( + pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1), + svreinterpret_u32_s32(y0))); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x( + pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0), + svreinterpret_u32_s32(y1))); + svfloat32_t d = svcvt_f32_u32_x( + pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1), + svreinterpret_u32_s32(y1))); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svcvt_u32_f32_x(pg_all32, svrinta_f32_x(pg_all32, result)); + }; + + auto calculate_linear = [&](uint32_t x) { + if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { + return calculate_linear_replicate(x); + } else { + static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); + return calculate_linear_constant_border(x); + } + }; + auto process_row = [&](size_t y, Columns dst) { float fy = static_cast(y); // Calculate half-transformed values at the first pixel (nominators) @@ -269,6 +366,37 @@ void warp_perspective_operation(Rows src_rows, } } +// Convert border_type to a template argument. +template +void warp_perspective_operation(kleidicv_border_type_t border_type, + Args &&...args) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + warp_perspective_operation( + std::forward(args)...); + } else { + warp_perspective_operation( + std::forward(args)...); + } +} + +// Convert interpolation_type to a template argument. +template +void warp_perspective_operation( + kleidicv_interpolation_type_t interpolation_type, Args &&...args) { + if (interpolation_type == KLEIDICV_INTERPOLATION_NEAREST) { + warp_perspective_operation( + std::forward(args)...); + } else { + warp_perspective_operation( + std::forward(args)...); + } +} + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -277,12 +405,15 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t y_begin, size_t y_end, const float transformation[9], size_t channels, kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t, const T *) { + kleidicv_border_type_t border_type, const T *border_value) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTERS(transformation); CHECK_IMAGE_SIZE(src_width, src_height); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { + return KLEIDICV_ERROR_NULL_POINTER; + } // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits @@ -299,35 +430,13 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( dst_rows += y_begin; if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { - switch (interpolation) { - case KLEIDICV_INTERPOLATION_NEAREST: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - case KLEIDICV_INTERPOLATION_LINEAR: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - default: - break; // GCOVR_EXCL_LINE - } + warp_perspective_operation( + interpolation, border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, y_begin, y_end); } else { - switch (interpolation) { - case KLEIDICV_INTERPOLATION_NEAREST: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - case KLEIDICV_INTERPOLATION_LINEAR: - warp_perspective_operation( - src_rows, src_width, src_height, transformation, dst_rows, - dst_width, y_begin, y_end); - break; - default: - break; // GCOVR_EXCL_LINE - } + warp_perspective_operation( + interpolation, border_type, src_rows, src_width, src_height, + transformation, border_value, dst_rows, dst_width, y_begin, y_end); } return KLEIDICV_OK; diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 601720b43..4eb782d1c 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -85,6 +85,10 @@ WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMA WarpPerspectiveNear_Nearest: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)' WarpPerspectiveNear_Linear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)' +WarpPerspective_Nearest_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 8UC1)' +WarpPerspectiveNear_Nearest_Constant: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_CONSTANT, 8UC1)' +WarpPerspective_Linear_Constant: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_CONSTANT, 8UC1)' +WarpPerspectiveNear_Linear_Constant: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_CONSTANT, 8UC1)' BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)' diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 862f0229d..27f6a6ba4 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -666,13 +666,13 @@ TEST_P(Thread, warp_perspective_u8_not_implemented) { KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_warp_perspective_not_implemented( kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + KLEIDICV_BORDER_TYPE_REFLECT, border_value); check_warp_perspective_not_implemented( kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_warp_perspective_not_implemented( kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + KLEIDICV_BORDER_TYPE_REFLECT, border_value); } TEST_P(Thread, SobelHorizontal1Channel) { diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp index 49cd9a674..482598002 100644 --- a/test/api/test_warp_perspective.cpp +++ b/test/api/test_warp_perspective.cpp @@ -4,6 +4,7 @@ #include +#include #include #include @@ -53,13 +54,43 @@ static void sequential_initializer(test::Array2D &source) { } } +template +static const ScalarType *get_array2d_element_or_border( + const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, + kleidicv_border_type_t border_type, const ScalarType *border_value) { + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { + x = std::clamp(x, 0, static_cast(src.width()) - 1); + y = std::clamp(y, 0, static_cast(src.height()) - 1); + } else { + assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); + if (x >= static_cast(src.width()) || + y >= static_cast(src.height()) || x < 0 || y < 0) { + return border_value; + } + } + return src.at(y, x * src.channels()); +} + +template +static const auto &get_borders() { + using P = std::pair; + static const T border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {4, 3, 2, 1}; + static const std::array borders{ + P{KLEIDICV_BORDER_TYPE_REPLICATE, nullptr}, + P{KLEIDICV_BORDER_TYPE_REPLICATE, border_value}, + P{KLEIDICV_BORDER_TYPE_CONSTANT, border_value}, + }; + return borders; +} + template class WarpPerspectiveNearest : public testing::Test { public: static void test_stripe(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, size_t y_begin, size_t y_end, const float transform[9], size_t channels, - size_t padding, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding, void (*initializer)(test::Array2D &) = sequential_initializer) { size_t src_total_width = channels * src_w; @@ -77,7 +108,8 @@ class WarpPerspectiveNearest : public testing::Test { } } - calculate_expected(source, y_begin, y_end, transform, expected); + calculate_expected(source, y_begin, y_end, transform, border_type, + border_value, expected); ASSERT_EQ( KLEIDICV_OK, @@ -85,7 +117,7 @@ class WarpPerspectiveNearest : public testing::Test { source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, {})); + border_type, border_value)); ScalarType threshold = 1; for (size_t row = y_begin; row < y_end; ++row) { @@ -114,16 +146,19 @@ class WarpPerspectiveNearest : public testing::Test { } static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - const float transform[9], size_t channels, size_t padding, + const float transform[9], size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding, void (*initializer)(test::Array2D &) = sequential_initializer) { test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels, - padding, initializer); + border_type, border_value, padding, initializer); } static void test_special_source( size_t src_w, size_t src_h, size_t src_stride, size_t dst_w, size_t dst_h, const float transform[9], size_t channels, + kleidicv_border_type_t border_type, const ScalarType *border_value, void (*initializer)(test::Array2D &) = sequential_initializer) { size_t src_total_width = channels * src_w; @@ -142,14 +177,15 @@ class WarpPerspectiveNearest : public testing::Test { actual.fill(42); - calculate_expected(source, 0, expected.height(), transform, expected); + calculate_expected(source, 0, expected.height(), transform, border_type, + border_value, expected); ASSERT_EQ(KLEIDICV_OK, kleidicv_warp_perspective_u8( source.data(), src_stride, src_w, src_h, actual.data(), actual.stride(), actual.width(), actual.height(), transform, - channels, KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, {})); + channels, KLEIDICV_INTERPOLATION_NEAREST, border_type, + border_value)); EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } @@ -157,7 +193,14 @@ class WarpPerspectiveNearest : public testing::Test { private: static void calculate_expected(test::Array2D &src, size_t y_begin, size_t y_end, const float transform[9], + kleidicv_border_type_t border_type, + const ScalarType *border_value, test::Array2D &expected) { + auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { + return get_array2d_element_or_border(src, x, y, border_type, + border_value); + }; + for (size_t y = y_begin; y < y_end; ++y) { for (size_t x = 0; x < expected.width() / src.channels(); ++x) { float dx = static_cast(x), dy = static_cast(y); @@ -168,13 +211,14 @@ class WarpPerspectiveNearest : public testing::Test { inv_w * (transform[0] * dx + transform[1] * dy + transform[2]); float fy = inv_w * (transform[3] * dx + transform[4] * dy + transform[5]); - ptrdiff_t ix = static_cast( - std::max(0, std::min(fx + 0.5, src.width() - 1.0))); - ptrdiff_t iy = static_cast( - std::max(0, std::min(fy + 0.5, src.height() - 1.0))); + ptrdiff_t ix = static_cast(std::max( + INT_MIN, + std::min(std::round(fx), KLEIDICV_MAX_IMAGE_PIXELS))); + ptrdiff_t iy = static_cast(std::max( + INT_MIN, + std::min(std::round(fy), KLEIDICV_MAX_IMAGE_PIXELS))); for (size_t ch = 0; ch < src.channels(); ++ch) { - *expected.at(y, x * src.channels() + ch) = - *src.at(iy, ix * src.channels() + ch); + *expected.at(y, x * src.channels() + ch) = get_src(ix, iy)[ch]; } } } @@ -189,7 +233,10 @@ TYPED_TEST(WarpPerspectiveNearest, IdentityNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, + border_type, border_value, 0); + } } TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) { @@ -197,7 +244,10 @@ TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, + border_type, border_value, 0); + } } TYPED_TEST(WarpPerspectiveNearest, SmallPadding) { @@ -205,7 +255,10 @@ TYPED_TEST(WarpPerspectiveNearest, SmallPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, + border_type, border_value, 13); + } } TYPED_TEST(WarpPerspectiveNearest, Upscale) { @@ -221,7 +274,10 @@ TYPED_TEST(WarpPerspectiveNearest, Upscale) { size_t src_h = 4; size_t dst_w = src_w * 3; size_t dst_h = src_h * 2; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, + border_type, border_value, 3); + } } TYPED_TEST(WarpPerspectiveNearest, RandomTransform) { @@ -235,7 +291,10 @@ TYPED_TEST(WarpPerspectiveNearest, RandomTransform) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, border_type, + border_value, 19); + } } TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) { @@ -251,7 +310,10 @@ TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, + border_type, border_value, 3); + } } static const size_t kBigWidth = 1ULL << 17, kBigHeight = 1ULL << 17; @@ -270,8 +332,8 @@ static void part_initializer(test::Array2D &source) { TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) { // clang-format off const float transform_cut[] = { - 1, 0, kBigWidth, - 0, 1, kBigHeight, + 1, 0, kBigWidth + kPartWidth * 0.5F, + 0, 1, kBigHeight + kPartHeight * 0.5F, 0, 0, 1 }; // clang-format on @@ -281,9 +343,10 @@ TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) { size_t dst_h = kPartHeight; size_t src_w = kBigWidth + kPartWidth; size_t src_h = kBigHeight + kPartHeight; - - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0, - part_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, border_type, + border_value, 0, part_initializer); + } } TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) { @@ -299,7 +362,10 @@ TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) { size_t dst_h = 1; size_t src_w = dst_w / 1000; size_t src_h = dst_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, + border_type, border_value, 3); + } } static const size_t kHugeHeight = (1ULL << 24) - 1; @@ -316,8 +382,11 @@ TYPED_TEST(WarpPerspectiveNearest, BigHeightDestination) { size_t dst_h = kHugeHeight; size_t src_w = dst_w; size_t src_h = dst_h / 10000; - TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, - dst_h, transform_upscale, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform_upscale, 1, border_type, + border_value, 0); + } } template @@ -343,9 +412,11 @@ TYPED_TEST(WarpPerspectiveNearest, HugeHeightSourceAndDestination) { size_t dst_h = (1ULL << 24) - 1; size_t src_w = dst_w; size_t src_h = dst_h; - TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h, - transform, 1, 0, - huge_height_part_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h, + transform, 1, border_type, border_value, 0, + huge_height_part_initializer); + } } static const size_t oneline_big_width = 1ULL << 23; @@ -375,11 +446,15 @@ TYPED_TEST(WarpPerspectiveNearest, OneLineBigSourceImage) { size_t src_w = oneline_big_width + oneline_part_width; size_t src_h = 1; - TestFixture::test_special_source(src_w, src_h, 0, dst_w, dst_h, tr, 1, - oneline_part_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_special_source(src_w, src_h, 0, dst_w, dst_h, tr, 1, + border_type, border_value, + oneline_part_initializer); - TestFixture::test_special_source(src_w, src_h, (1ULL << 32) - 1, dst_w, dst_h, - tr, 1, oneline_part_initializer); + TestFixture::test_special_source(src_w, src_h, (1ULL << 32) - 1, dst_w, + dst_h, tr, 1, border_type, border_value, + oneline_part_initializer); + } } TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) { @@ -396,15 +471,19 @@ TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) { size_t src_w = 2; size_t src_h = 1; - TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1, + border_type, border_value); + } } TYPED_TEST(WarpPerspectiveNearest, NullPointer) { const TypeParam src[4] = {}; + const TypeParam border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {}; TypeParam dst[8]; test::test_null_args(kleidicv_warp_perspective_u8, src, 2, 2, 2, dst, 8, 8, 1, transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); } TYPED_TEST(WarpPerspectiveNearest, ZeroImageSize) { @@ -465,15 +544,16 @@ TYPED_TEST(WarpPerspectiveNearest, UnsupportedTwoChannels) { nullptr)); } -TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderTypeConst) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderType) { const TypeParam src[1] = {}; + const TypeParam border_value[KLEIDICV_MAXIMUM_CHANNEL_COUNT] = {}; TypeParam dst[8]; EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, kleidicv_warp_perspective_u8( src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1, - KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_CONSTANT, - nullptr)); + KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REFLECT, + border_value)); } TYPED_TEST(WarpPerspectiveNearest, UnsupportedTooSmallImage) { @@ -552,7 +632,8 @@ class WarpPerspectiveLinear : public testing::Test { static void test_stripe(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, size_t y_begin, size_t y_end, const float transform[9], size_t channels, - size_t padding, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding, void (*initializer)(test::Array2D &) = sequential_initializer) { size_t src_total_width = channels * src_w; @@ -570,7 +651,8 @@ class WarpPerspectiveLinear : public testing::Test { } } - calculate_expected(source, y_begin, y_end, transform, expected); + calculate_expected(source, y_begin, y_end, transform, border_type, + border_value, expected); ASSERT_EQ( KLEIDICV_OK, @@ -578,7 +660,7 @@ class WarpPerspectiveLinear : public testing::Test { source.data(), source.stride(), source.width(), source.height(), actual.data(), actual.stride(), actual.width(), actual.height(), y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, {})); + border_type, border_value)); ScalarType threshold = 1; for (size_t row = y_begin; row < y_end; ++row) { @@ -607,17 +689,26 @@ class WarpPerspectiveLinear : public testing::Test { } static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - const float transform[9], size_t channels, size_t padding, + const float transform[9], size_t channels, + kleidicv_border_type_t border_type, + const ScalarType *border_value, size_t padding, void (*initializer)(test::Array2D &) = sequential_initializer) { test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels, - padding, initializer); + border_type, border_value, padding, initializer); } private: static void calculate_expected(test::Array2D &src, size_t y_begin, size_t y_end, const float transform[9], + kleidicv_border_type_t border_type, + const ScalarType *border_value, test::Array2D &expected) { + auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { + return get_array2d_element_or_border(src, x, y, border_type, + border_value); + }; + for (size_t y = y_begin; y < y_end; ++y) { for (size_t x = 0; x < expected.width() / src.channels(); ++x) { double dx = static_cast(x), dy = static_cast(y); @@ -627,21 +718,19 @@ class WarpPerspectiveLinear : public testing::Test { double ty = transform[3] * dx + transform[4] * dy + transform[5]; double fx = inv_w * tx; double fy = inv_w * ty; - uint32_t ix0 = static_cast( - std::max(0.0, std::min(fx, src.width() - 1.0))); - uint32_t iy0 = static_cast( - std::max(0.0, std::min(fy, src.height() - 1.0))); - uint32_t ix1 = (ix0 < src.width() - 1 && fx >= 0) ? ix0 + 1 : ix0; - uint32_t iy1 = (iy0 < src.height() - 1 && fy >= 0) ? iy0 + 1 : iy0; - double xfrac = - (ix0 < src.width() - 1 && fx >= 0) ? fx - floor(fx) : 0.0; - double yfrac = - (iy0 < src.height() - 1 && fy >= 0) ? fy - floor(fy) : 0.0; + ptrdiff_t ix0 = static_cast(std::max( + INT_MIN, std::min(floor(fx), KLEIDICV_MAX_IMAGE_PIXELS))); + ptrdiff_t iy0 = static_cast(std::max( + INT_MIN, std::min(floor(fy), KLEIDICV_MAX_IMAGE_PIXELS))); + ptrdiff_t ix1 = ix0 + 1; + ptrdiff_t iy1 = iy0 + 1; + double xfrac = std::isfinite(fx) ? fx - floor(fx) : 0.0; + double yfrac = std::isfinite(fy) ? fy - floor(fy) : 0.0; for (size_t ch = 0; ch < src.channels(); ++ch) { - double a = *src.at(iy0, ix0 * src.channels() + ch); - double b = *src.at(iy0, ix1 * src.channels() + ch); - double c = *src.at(iy1, ix0 * src.channels() + ch); - double d = *src.at(iy1, ix1 * src.channels() + ch); + double a = get_src(ix0, iy0)[ch]; + double b = get_src(ix1, iy0)[ch]; + double c = get_src(ix0, iy1)[ch]; + double d = get_src(ix1, iy1)[ch]; double line1 = (b - a) * xfrac + a; double line2 = (d - c) * xfrac + c; double double_result = (line2 - line1) * yfrac + line1; @@ -704,7 +793,10 @@ TYPED_TEST(WarpPerspectiveLinear, IdentityNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, + border_type, border_value, 0); + } } TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) { @@ -712,7 +804,10 @@ TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, + border_type, border_value, 0); + } } TYPED_TEST(WarpPerspectiveLinear, SmallPadding) { @@ -720,7 +815,10 @@ TYPED_TEST(WarpPerspectiveLinear, SmallPadding) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, + border_type, border_value, 13); + } } TYPED_TEST(WarpPerspectiveLinear, Upscale) { @@ -736,7 +834,10 @@ TYPED_TEST(WarpPerspectiveLinear, Upscale) { size_t src_h = 4; size_t dst_w = src_w * 3; size_t dst_h = src_h * 2; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, + border_type, border_value, 3); + } } TYPED_TEST(WarpPerspectiveLinear, RandomTransform) { @@ -758,7 +859,10 @@ TYPED_TEST(WarpPerspectiveLinear, RandomTransform) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, border_type, + border_value, 19); + } } } @@ -775,7 +879,10 @@ TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) { size_t src_h = 4; size_t dst_w = src_w; size_t dst_h = src_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, + border_type, border_value, 3); + } } template @@ -801,8 +908,10 @@ TYPED_TEST(WarpPerspectiveLinear, RoughSource) { size_t src_h = 634; size_t dst_w = 17; size_t dst_h = 852; - TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1, 13, - crisscross_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1, + border_type, border_value, 13, crisscross_initializer); + } } TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) { @@ -820,8 +929,10 @@ TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) { size_t src_w = kBigWidth + kPartWidth + 3; size_t src_h = kBigHeight + kPartHeight; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0, - part_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, border_type, + border_value, 0, part_initializer); + } } TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) { @@ -837,7 +948,10 @@ TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) { size_t dst_h = 1; size_t src_w = dst_w / 1000; size_t src_h = dst_h; - TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, + border_type, border_value, 3); + } } TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) { @@ -853,8 +967,11 @@ TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) { size_t dst_h = kHugeHeight; size_t src_w = dst_w; size_t src_h = dst_h / 10000; - TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, - dst_h, transform_upscale, 1, 0); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform_upscale, 1, border_type, + border_value, 0); + } } TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) { @@ -870,7 +987,9 @@ TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) { size_t dst_h = (1ULL << 24) - 1; size_t src_w = dst_w; size_t src_h = dst_h; - TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, - dst_h, transform, 1, 0, - huge_height_part_initializer); + for (auto [border_type, border_value] : get_borders()) { + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform, 1, border_type, border_value, 0, + huge_height_part_initializer); + } } -- GitLab From dba6435748eceff28a78e7855748a399c187b5e9 Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 14 Jan 2025 15:50:05 +0000 Subject: [PATCH 2/2] Refactor warp_perspective_sc.h The use of mutable variables across many lambdas had the effect of action-at-a-distance [1]. [1] https://en.wikipedia.org/wiki/Action_at_a_distance_(computer_programming) --- kleidicv/src/transform/warp_perspective_sc.h | 217 ++++++++----------- 1 file changed, 96 insertions(+), 121 deletions(-) diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h index 519d544dd..20cda8d79 100644 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ b/kleidicv/src/transform/warp_perspective_sc.h @@ -47,7 +47,6 @@ void warp_perspective_operation(Rows src_rows, const ScalarType *border_value, Rows dst_rows, size_t dst_width, size_t y_begin, size_t y_end) { - svbool_t pg_all64 = svptrue_b64(); svbool_t pg_all32 = svptrue_b32(); svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); svuint32_t sv_xmax = svdup_n_u32(src_width - 1); @@ -65,13 +64,8 @@ void warp_perspective_operation(Rows src_rows, svfloat32_t T3 = svdup_n_f32(transform[3]); svfloat32_t T6 = svdup_n_f32(transform[6]); svfloat32_t tx0, ty0, tw0; - svfloat32_t xf, yf; svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); - svuint32_t xi, yi; - - svbool_t pg32 = pg_all32; - svbool_t pg64_b = pg_all64, pg64_t = pg_all64; const size_t kStep = VecTraits::num_lanes(); @@ -86,13 +80,16 @@ void warp_perspective_operation(Rows src_rows, svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); // Calculate coordinates into the source image - xf = svmul_f32_x(pg_all32, tx, iw); - yf = svmul_f32_x(pg_all32, ty, iw); + return svcreate2(svmul_f32_x(pg_all32, tx, iw), + svmul_f32_x(pg_all32, ty, iw)); }; auto calculate_nearest_coordinates = [&](size_t x) { - calculate_coordinates(x); + svfloat32x2_t coords = calculate_coordinates(x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); + svuint32_t xi, yi; if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { // Round to the nearest integer xi = svreinterpret_u32_s32( @@ -109,69 +106,59 @@ void warp_perspective_operation(Rows src_rows, svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), sv_ymax); } + return svcreate2(xi, yi); }; - auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) { - svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); - }; - - auto load_large = [&](svbool_t pg_b, svbool_t pg_t, svuint32_t x, - svuint32_t y) { - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final offsets should be in 64 bits - svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); - svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); - // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - }; - - auto get_pixels_or_border = [&](svuint32_t x, svuint32_t y) { - svbool_t in_range = svand_b_z(pg32, svcmple_u32(pg32, x, sv_xmax), - svcmple_u32(pg32, y, sv_ymax)); - - svuint32_t result; + auto load = [&](svbool_t pg, svuint32_t x, svuint32_t y) { if constexpr (IsLarge) { - svbool_t pg_b = in_range; - svbool_t pg_t = svtrn2_b32(in_range, svpfalse()); - result = load_large(pg_b, pg_t, x, y); + svbool_t pg_b = pg; + svbool_t pg_t = svtrn2_b32(pg, svpfalse()); + + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = + svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + svuint64_t result_t = + svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); } else { - result = load_small(in_range, x, y); + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); } + }; + auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + svbool_t in_range = + svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); + svuint32_t result = load(in_range, x, y); // Select between source pixels and border colour return svsel_u32(in_range, result, sv_border); }; auto vector_path_nearest_4x = [&](size_t x, Columns dst) { - auto load_source = [&](svuint32_t x, svuint32_t y) { + auto load_source = [&](svuint32x2_t coords) { + svuint32_t x = svget2(coords, 0); + svuint32_t y = svget2(coords, 1); if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - return get_pixels_or_border(x, y); - } else if constexpr (IsLarge) { - return load_large(pg_all64, pg_all64, x, y); + return get_pixels_or_border(pg_all32, x, y); } else { - return load_small(pg_all32, x, y); + return load(pg_all32, x, y); } }; ScalarType *p_dst = &dst[static_cast(x)]; - calculate_nearest_coordinates(x); - svuint32_t res32_0 = load_source(xi, yi); + svuint32_t res32_0 = load_source(calculate_nearest_coordinates(x)); x += kStep; - calculate_nearest_coordinates(x); - svuint32_t res32_1 = load_source(xi, yi); + svuint32_t res32_1 = load_source(calculate_nearest_coordinates(x)); svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), svreinterpret_u16_u32(res32_1)); x += kStep; - calculate_nearest_coordinates(x); - res32_0 = load_source(xi, yi); + res32_0 = load_source(calculate_nearest_coordinates(x)); x += kStep; - calculate_nearest_coordinates(x); - res32_1 = load_source(xi, yi); + res32_1 = load_source(calculate_nearest_coordinates(x)); svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), svreinterpret_u16_u32(res32_1)); svuint8_t result = @@ -182,31 +169,29 @@ void warp_perspective_operation(Rows src_rows, auto vector_path_nearest_tail = [&](size_t x, size_t x_max, Columns dst) { size_t length = x_max - x; - pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2); - pg64_t = svwhilelt_b64(0ULL, length / 2); - pg32 = svwhilelt_b32(0ULL, length); - calculate_nearest_coordinates(x); - svuint32_t result; + svbool_t pg32 = svwhilelt_b32(0ULL, length); + svuint32x2_t coords = calculate_nearest_coordinates(x); + svuint32_t xi = svget2(coords, 0); + svuint32_t yi = svget2(coords, 1); + + svuint32_t result; if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { - result = get_pixels_or_border(xi, yi); + result = get_pixels_or_border(pg32, xi, yi); } else { - // To avoid losing precision, the final indices use 64 bits - result = load_large(pg64_b, pg64_t, xi, yi); + result = load(pg32, xi, yi); } svst1b_u32(pg32, &dst[static_cast(x)], result); }; - auto calculate_linear_replicate = [&](uint32_t x) { + auto calculate_linear_replicate = [&](svbool_t pg, uint32_t x) { auto load_source = [&](svuint32_t x, svuint32_t y) { - if constexpr (IsLarge) { - return load_large(pg64_b, pg64_t, x, y); - } else { - return load_small(pg32, x, y); - } + return load(pg, x, y); }; - calculate_coordinates(x); + svfloat32x2_t coords = calculate_coordinates(x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); // Take the integer part, clamp it to within the dimensions of the // source image (negative values are already saturated to 0) svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf)); @@ -245,56 +230,54 @@ void warp_perspective_operation(Rows src_rows, svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F))); }; - auto calculate_linear_constant_border = [&](uint32_t x) { - calculate_coordinates(x); + auto calculate_linear_constant_border = [&](svbool_t pg, uint32_t x) { + svfloat32x2_t coords = calculate_coordinates(x); + svfloat32_t xf = svget2(coords, 0); + svfloat32_t yf = svget2(coords, 1); // Convert obviously out-of-range coordinates to values that are just beyond // the largest permitted image width & height. This avoids the need for // special case handling elsewhere. svfloat32_t big = svdup_n_f32(1 << 24); - xf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, xf), big), xf, - big); - yf = svsel_f32(svcmple_f32(pg_all32, svabs_f32_x(pg_all32, yf), big), yf, - big); - - svfloat32_t xf0 = svrintm_f32_x(pg_all32, xf); - svfloat32_t yf0 = svrintm_f32_x(pg_all32, yf); - - svint32_t x0 = svcvt_s32_x(pg_all32, xf0); - svint32_t y0 = svcvt_s32_x(pg_all32, yf0); - svint32_t x1 = svadd_s32_x(pg_all32, x0, svdup_n_s32(1)); - svint32_t y1 = svadd_s32_x(pg_all32, y0, svdup_n_s32(1)); - - svfloat32_t xfrac = svsub_f32_x(pg_all32, xf, xf0); - svfloat32_t yfrac = svsub_f32_x(pg_all32, yf, yf0); - - svfloat32_t a = svcvt_f32_u32_x( - pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0), - svreinterpret_u32_s32(y0))); - svfloat32_t b = svcvt_f32_u32_x( - pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1), - svreinterpret_u32_s32(y0))); - svfloat32_t line0 = - svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); - svfloat32_t c = svcvt_f32_u32_x( - pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x0), - svreinterpret_u32_s32(y1))); - svfloat32_t d = svcvt_f32_u32_x( - pg_all32, get_pixels_or_border(svreinterpret_u32_s32(x1), - svreinterpret_u32_s32(y1))); - svfloat32_t line1 = - svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); - svfloat32_t result = svmla_f32_x( - pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); - return svcvt_u32_f32_x(pg_all32, svrinta_f32_x(pg_all32, result)); + xf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, xf), big), xf, big); + yf = svsel_f32(svcmple_f32(pg, svabs_f32_x(pg, yf), big), yf, big); + + svfloat32_t xf0 = svrintm_f32_x(pg, xf); + svfloat32_t yf0 = svrintm_f32_x(pg, yf); + + svint32_t x0 = svcvt_s32_x(pg, xf0); + svint32_t y0 = svcvt_s32_x(pg, yf0); + svint32_t x1 = svadd_s32_x(pg, x0, svdup_n_s32(1)); + svint32_t y1 = svadd_s32_x(pg, y0, svdup_n_s32(1)); + + svfloat32_t xfrac = svsub_f32_x(pg, xf, xf0); + svfloat32_t yfrac = svsub_f32_x(pg, yf, yf0); + + svfloat32_t a = + svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0), + svreinterpret_u32_s32(y0))); + svfloat32_t b = + svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1), + svreinterpret_u32_s32(y0))); + svfloat32_t line0 = svmla_f32_x(pg, a, svsub_f32_x(pg, b, a), xfrac); + svfloat32_t c = + svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x0), + svreinterpret_u32_s32(y1))); + svfloat32_t d = + svcvt_f32_u32_x(pg, get_pixels_or_border(pg, svreinterpret_u32_s32(x1), + svreinterpret_u32_s32(y1))); + svfloat32_t line1 = svmla_f32_x(pg, c, svsub_f32_x(pg, d, c), xfrac); + svfloat32_t result = + svmla_f32_x(pg, line0, svsub_f32_x(pg, line1, line0), yfrac); + return svcvt_u32_f32_x(pg, svrinta_f32_x(pg, result)); }; - auto calculate_linear = [&](uint32_t x) { + auto calculate_linear = [&](svbool_t pg, uint32_t x) { if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { - return calculate_linear_replicate(x); + return calculate_linear_replicate(pg, x); } else { static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); - return calculate_linear_constant_border(x); + return calculate_linear_constant_border(pg, x); } }; @@ -308,9 +291,6 @@ void warp_perspective_operation(Rows src_rows, ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5])); tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8])); - pg32 = pg_all32; - pg64_b = pg64_t = pg_all64; - LoopUnroll2 loop{dst_width, kStep}; if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); }); @@ -322,15 +302,15 @@ void warp_perspective_operation(Rows src_rows, } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { loop.unroll_four_times([&](size_t x) { ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t res0 = calculate_linear(x); + svuint32_t res0 = calculate_linear(pg_all32, x); x += kStep; - svuint32_t res1 = calculate_linear(x); + svuint32_t res1 = calculate_linear(pg_all32, x); svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); x += kStep; - res0 = calculate_linear(x); + res0 = calculate_linear(pg_all32, x); x += kStep; - res1 = calculate_linear(x); + res1 = calculate_linear(pg_all32, x); svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), svreinterpret_u16_u32(res1)); svst1_u8(svptrue_b8(), p_dst, @@ -339,18 +319,13 @@ void warp_perspective_operation(Rows src_rows, }); loop.unroll_once([&](size_t x) { ScalarType *p_dst = &dst[static_cast(x)]; - svuint32_t result = calculate_linear(x); + svuint32_t result = calculate_linear(pg_all32, x); svst1b_u32(pg_all32, p_dst, result); }); loop.remaining([&](size_t x, size_t x_max) { ScalarType *p_dst = &dst[static_cast(x)]; - size_t length = x_max - x; - pg32 = svwhilelt_b32(0ULL, length); - if constexpr (IsLarge) { - pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2); - pg64_t = svwhilelt_b64(0ULL, length / 2); - } - svuint32_t result = calculate_linear(x); + svbool_t pg32 = svwhilelt_b32(x, x_max); + svuint32_t result = calculate_linear(pg32, x); svst1b_u32(pg32, p_dst, result); }); } else { -- GitLab