From d2b365765bece0d8ad569ad4c31842620363d2d1 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Thu, 19 Dec 2024 07:49:49 +0000 Subject: [PATCH] Implement WarpPerspective Linear Interpolation --- CHANGELOG.md | 2 + adapters/opencv/kleidicv_hal.cpp | 26 +- adapters/opencv/kleidicv_hal.h | 5 - adapters/opencv/opencv-4.10.patch | 33 +- adapters/opencv/opencv-5.x.patch | 32 +- benchmark/benchmark.cpp | 37 +- conformity/opencv/test_warp_perspective.cpp | 20 +- doc/functionality.md | 6 + doc/opencv.md | 11 + kleidicv/CMakeLists.txt | 1 - kleidicv/include/kleidicv/config.h.in | 2 - kleidicv/include/kleidicv/kleidicv.h | 8 +- .../kleidicv/transform/warp_perspective.h | 11 +- kleidicv/include/kleidicv/utils.h | 29 +- .../src/transform/warp_perspective_api.cpp | 6 +- .../src/transform/warp_perspective_neon.cpp | 346 +++++++---- kleidicv/src/transform/warp_perspective_sc.h | 407 +++++++----- .../src/transform/warp_perspective_sve2.cpp | 3 +- .../include/kleidicv_thread/kleidicv_thread.h | 1 + kleidicv_thread/src/kleidicv_thread.cpp | 5 +- scripts/benchmark/benchmarks.txt | 6 +- test/api/test_thread.cpp | 31 +- test/api/test_warp_perspective.cpp | 585 ++++++++++++++++-- test/framework/array.h | 47 +- 24 files changed, 1218 insertions(+), 442 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d96fc0663..0e8807ac0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,8 @@ This changelog aims to follow the guiding principles of ### Added - Implementation of Rotate 90 degrees clockwise. - Remap function with nearest neighbour and fixed-point interpolation for 8-bit inputs. +- WarpPerspective implementation + - Nearest and Linear interpolation method, for replicated borders and 1-channel u8 input. ### Changed - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 0a6aa9c6e..4c57b0857 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -353,7 +353,6 @@ static int from_opencv(int opencv_border_type, return 0; } -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE // Converts an OpenCV interpolation type to a KleidiCV interpolation type. static int from_opencv(int opencv_interpolation, kleidicv_interpolation_type_t &interpolation_type) { @@ -361,19 +360,15 @@ static int from_opencv(int opencv_interpolation, default: return 1; case CV_HAL_INTER_NEAREST: - interpolation_type = - kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST; + interpolation_type = KLEIDICV_INTERPOLATION_NEAREST; break; case CV_HAL_INTER_LINEAR: - interpolation_type = - kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_LINEAR; + interpolation_type = KLEIDICV_INTERPOLATION_LINEAR; break; } return 0; } -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE - struct SeparableFilter2DParams { size_t channels; kleidicv_border_type_t border_type; @@ -1421,24 +1416,23 @@ int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data, src, src_step, width + 2, height + 2, cn, dst_data, dst_step, mt)); } -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE int warp_perspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, - const double transformation[9], int interpolation, + const double transformation_f64[9], int interpolation, int border_type, const double border_value_f64[4]) { kleidicv_border_type_t kleidicv_border_type; if (from_opencv(border_type, kleidicv_border_type)) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - float float_transformation[9]; + float transformation[9]; for (size_t i = 0; i < 9; ++i) { - float_transformation[i] = static_cast(transformation[i]); + transformation[i] = static_cast(transformation_f64[i]); } - kleidicv_interpolation_type_t kleidicv_interpolation_type; - if (from_opencv(interpolation, kleidicv_interpolation_type)) { + kleidicv_interpolation_type_t kleidicv_interpolation; + if (from_opencv(interpolation, kleidicv_interpolation)) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -1451,12 +1445,10 @@ int warp_perspective(int src_type, const uchar *src_data, size_t src_step, src_data, src_step, static_cast(src_width), static_cast(src_height), dst_data, dst_step, static_cast(dst_width), static_cast(dst_height), - float_transformation, 1, kleidicv_interpolation_type, - kleidicv_border_type, border_value.data(), mt)); + transformation, 1, kleidicv_interpolation, kleidicv_border_type, + border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE - } // namespace kleidicv::hal diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 18cc07e2a..2a5f04e31 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -158,13 +158,11 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, const uint16_t *mapfrac, size_t mapfrac_step, int border_type, const double border_value[4]); -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE int warp_perspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double transformation[9], int interpolation, int borderType, const double borderValue[4]); -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data, size_t dst_step, int width, int height, int cn); @@ -432,7 +430,6 @@ static inline int kleidicv_pyrdown_with_fallback( #undef cv_hal_pyrdown #define cv_hal_pyrdown kleidicv_pyrdown_with_fallback -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE // warp_perspective static inline int kleidicv_warp_perspective_with_fallback( int src_type, const uchar *src_data, size_t src_step, int src_width, @@ -448,8 +445,6 @@ static inline int kleidicv_warp_perspective_with_fallback( #undef cv_hal_warpPerspective #define cv_hal_warpPerspective kleidicv_warp_perspective_with_fallback -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE - #endif // OPENCV_IMGPROC_HAL_REPLACEMENT_HPP #ifdef OPENCV_CORE_HAL_REPLACEMENT_HPP diff --git a/adapters/opencv/opencv-4.10.patch b/adapters/opencv/opencv-4.10.patch index 5283cbb8f..85599737b 100644 --- a/adapters/opencv/opencv-4.10.patch +++ b/adapters/opencv/opencv-4.10.patch @@ -19,7 +19,7 @@ index 2b4035285f..729cd1dd43 100644 @@ -281,6 +281,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const dst.create(dims, size, dtype); Mat dstMat = dst.getMat(); - + + if( dims <= 2 ) { + int width_in_elements = src.cols * cn; + CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta); @@ -35,7 +35,7 @@ index f78608dbad..a9384588ec 100644 @@ -953,6 +953,53 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst #define cv_hal_transpose2d hal_ni_transpose2d //! @endcond - + +/** + @brief sum + @param src_data,src_step,src_type Source image @@ -84,8 +84,8 @@ index f78608dbad..a9384588ec 100644 +//! @endcond + //! @} - - + + diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index 8c6d8ad9a9..47eb6fdb66 100644 --- a/modules/core/src/minmax.cpp @@ -124,7 +124,7 @@ index 773fed9b48..b74ff70f99 100644 @@ -328,6 +328,60 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step, #define cv_hal_remap32f hal_ni_remap32f //! @endcond - + +/** + @brief hal_remap with a short integer map + @param src_type source and destination image type @@ -199,7 +199,7 @@ index d7c9c64c3c..348208b72d 100644 + CALL_HAL(remap16s16u, cv_hal_remap16s16u, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, + map1.ptr(), map1.step, map2.ptr(), map2.step, borderType, borderValue.val); } - + interpolation &= ~WARP_RELATIVE_MAP; diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp index d0f50a73bb..1c308887dc 100644 @@ -208,7 +208,7 @@ index d0f50a73bb..1c308887dc 100644 @@ -654,6 +654,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType) ); - + + { + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); @@ -316,12 +316,12 @@ index 6d51c0cf1a..0e6f6a324e 100644 +++ b/modules/video/src/lkpyramid.cpp @@ -50,6 +50,7 @@ #endif - + #include "opencv2/core/openvx/ovx_defs.hpp" +#include "hal_replacement.hpp" - + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) - + @@ -62,6 +63,9 @@ static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst) int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth(); CV_Assert(depth == CV_8U); @@ -331,4 +331,15 @@ index 6d51c0cf1a..0e6f6a324e 100644 + parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads()); } - + +diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp +index 83a5e23781..75df962fb7 100644 +--- a/modules/imgproc/test/test_imgwarp_strict.cpp ++++ b/modules/imgproc/test/test_imgwarp_strict.cpp +@@ -239,7 +239,7 @@ float CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) con + else if (_interpolation == INTER_AREA) + return 2.0f; + else +- return 1.0f; ++ return 4.0f; // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test + } diff --git a/adapters/opencv/opencv-5.x.patch b/adapters/opencv/opencv-5.x.patch index cc7f77c75..3f0298d27 100644 --- a/adapters/opencv/opencv-5.x.patch +++ b/adapters/opencv/opencv-5.x.patch @@ -27,7 +27,7 @@ index 56767f2ec7..5ade43b063 100644 @@ -949,6 +951,13 @@ if(NOT DEFINED OpenCV_HAL) set(OpenCV_HAL "OpenCV_HAL") endif() - + +if(WITH_KLEIDICV) + ocv_debug_message(STATUS "Enable KleidiCV acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";kleidicv;") @@ -55,7 +55,7 @@ index 3c61ae1c9a..f44e7d8dc2 100644 +++ b/doc/tutorials/introduction/config_reference/config_reference.markdown @@ -621,6 +621,7 @@ Following build options are utilized in `opencv_contrib` modules, as stated [pre `CMAKE_TOOLCHAIN_FILE` - + `WITH_CAROTENE` +`WITH_KLEIDICV` `WITH_CPUFEATURES` @@ -66,14 +66,14 @@ index c7445a4de4..49a459597b 100644 --- a/modules/core/include/opencv2/core/hal/interface.h +++ b/modules/core/include/opencv2/core/hal/interface.h @@ -90,6 +90,8 @@ typedef short cv_hal_bf16; - + #define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1) #define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK) +#define CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT) +#define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1) #define CV_IS_INT_TYPE(flags) (((1 << CV_MAT_DEPTH(flags)) & 0x1e1f) != 0) #define CV_IS_FLOAT_TYPE(flags) (((1 << CV_MAT_DEPTH(flags)) & 0x1e0) != 0) - + diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp index 4a8432fb0e..82cecf90c0 100644 --- a/modules/core/src/convert.dispatch.cpp @@ -81,7 +81,7 @@ index 4a8432fb0e..82cecf90c0 100644 @@ -195,6 +195,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const dst.create( dims, size, dtype, -1, allowTransposed ); Mat dstMat = dst.getMat(); - + + if( dims <= 2 ) { + int width_in_elements = src.cols * cn; + CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta); @@ -97,7 +97,7 @@ index dc65a25684..5be57f0923 100644 @@ -968,6 +968,21 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst #define cv_hal_transpose2d hal_ni_transpose2d //! @endcond - + +/** + @brief convertTo + @param src_data,src_step,src_depth Source image @@ -114,8 +114,8 @@ index dc65a25684..5be57f0923 100644 +//! @endcond + //! @} - - + + diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp index 5c6497bd80..43e89d5b9c 100644 --- a/modules/imgproc/src/hal_replacement.hpp @@ -123,7 +123,7 @@ index 5c6497bd80..43e89d5b9c 100644 @@ -880,6 +880,12 @@ inline int hal_ni_gaussianBlur(const uchar* src_data, size_t src_step, uchar* ds #define cv_hal_gaussianBlur hal_ni_gaussianBlur //! @endcond - + +inline int hal_ni_gaussianBlurBinomial(const uchar*, size_t, uchar*, size_t, int, int, int, int, size_t, size_t, size_t, size_t, size_t, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } + +//! @cond IGNORED @@ -140,7 +140,7 @@ index 3e23187de2..45737b1ee3 100644 @@ -570,6 +570,20 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType) ); - + + if (sigma1 == 0.0 && sigma2 == 0.0 && ksize.height == ksize.width) + { + Mat src = _src.getMat(); @@ -158,3 +158,15 @@ index 3e23187de2..45737b1ee3 100644 if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix())) { std::vector fkx, fky; + +diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp +index 83a5e23781..75df962fb7 100644 +--- a/modules/imgproc/test/test_imgwarp_strict.cpp ++++ b/modules/imgproc/test/test_imgwarp_strict.cpp +@@ -239,7 +239,7 @@ float CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) con + else if (_interpolation == INTER_AREA) + return 2.0f; + else +- return 1.0f; ++ return 4.0f; // Reference algorithm uses integer interpolation with 5 bits fractional part, and this greater tolerance allows better precision algorithms to pass the test + } diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index bf178d1f5..94cff9f28 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -677,7 +677,6 @@ BENCH_REMAP_S16POINT5(remap_s16point5_u8_identity, remap_s16point5_u8, get_identity_mapxy, 1, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE // clang-format off static const float transform_identity[] = { 1.0, 0, 0, @@ -735,23 +734,45 @@ static void warp_perspective(Function f, const float transform[9], } \ BENCHMARK(benchname) -BENCH_WARP_PERSPECTIVE(warp_perspective_u8_identity, warp_perspective_u8, - transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST, +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_identity, + warp_perspective_u8, transform_identity, 1, + KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -BENCH_WARP_PERSPECTIVE(warp_perspective_u8_small, warp_perspective_u8, +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_small, warp_perspective_u8, transform_small, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -BENCH_WARP_PERSPECTIVE(warp_perspective_u8_bend, warp_perspective_u8, +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_bend, warp_perspective_u8, transform_bend, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -BENCH_WARP_PERSPECTIVE(warp_perspective_u8_rotate, warp_perspective_u8, +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_rotate, warp_perspective_u8, transform_rotate, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -BENCH_WARP_PERSPECTIVE(warp_perspective_u8_near, warp_perspective_u8, +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_nearest_near, warp_perspective_u8, transform_near, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE + +// WarpPerspective Linear + +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_identity, warp_perspective_u8, + transform_identity, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_small, warp_perspective_u8, + transform_small, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_bend, warp_perspective_u8, + transform_bend, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_rotate, warp_perspective_u8, + transform_rotate, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); + +BENCH_WARP_PERSPECTIVE(warp_perspective_u8_linear_near, warp_perspective_u8, + transform_near, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t); diff --git a/conformity/opencv/test_warp_perspective.cpp b/conformity/opencv/test_warp_perspective.cpp index 775e3d435..af57e1b1d 100644 --- a/conformity/opencv/test_warp_perspective.cpp +++ b/conformity/opencv/test_warp_perspective.cpp @@ -30,7 +30,7 @@ cv::Mat exec_warp_perspective(cv::Mat& source_mat) { } #if MANAGER -const int kMaxHeight = 42, kMaxWidth = 42; +const int kMaxHeight = 16, kMaxWidth = 65; template @@ -39,11 +39,20 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue, for (size_t w = 8; w <= kMaxWidth; w += 3) { for (size_t h = 8; h <= kMaxHeight; h += 4) { cv::Mat source_mat(h, w, Format); + // Perspective calculation in float greatly amplifies any small errors + // coming from precision innaccuracies, let it be Fused-Multiply-Add or + // just the limitation of the single precision float. Taking the + // fractional part of a value in the thousands' range decreases the + // precision by 3 decimal digits, and the single precision float only has + // 7 decimal digits. Doing a linear interpolation between 0 and 255 would + // decrease it even more, but ensuring that neighbouring pixels have + // neighbouring values decreases this effect by more than 2 decimal digits + // so it can be expected that the error won't be bigger than 1. for (size_t row = 0; row < h; ++row) { for (size_t column = 0; column < w; ++column) { + const int kMaxVal = std::numeric_limits::max(); source_mat.at(row, column) = - (row * (w + 12) + column) % - std::numeric_limits::max(); + abs(static_cast(row + column) % (2 * kMaxVal + 1) - kMaxVal); } } @@ -56,7 +65,7 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue, bool success = !are_matrices_different(1, actual_mat, expected_mat); if (!success) { - fail_print_matrices(w, h, source_mat, actual_mat, expected_mat); + fail_print_matrices(h, w, source_mat, actual_mat, expected_mat); return true; } } @@ -68,7 +77,8 @@ bool test_warp_perspective(int index, RecreatedMessageQueue& request_queue, std::vector& warp_perspective_tests_get() { // clang-format off static std::vector tests = { - TEST("WarpPerspective uint8", (test_warp_perspective), (exec_warp_perspective)), + TEST("WarpPerspectiveNearest uint8", (test_warp_perspective), (exec_warp_perspective)), + TEST("WarpPerspectiveLinear uint8", (test_warp_perspective), (exec_warp_perspective)), }; // clang-format on return tests; diff --git a/doc/functionality.md b/doc/functionality.md index 71f4860d8..3271c47ff 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -97,3 +97,9 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. |--------------------------------------------|-----|-----| | Remap int16 coordinates | x | | | Remap int16+uint16 fixed-point coordinates | x | | + +# WarpPerspective +| | u8 | u16 | +|------------------------------------------|-----|-----| +| Nearest neighbour, replicated borders | x | | +| Linear interpolation, replicated borders | x | | diff --git a/doc/opencv.md b/doc/opencv.md index 2c04dc4ec..bbc886fff 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -218,6 +218,17 @@ Supported map configurations: * `map1` is 16SC2 and `map2` is 16UC1: `map1` is as above, `map2` contains combined 5+5 bits of x (low) and y (high) fractions, i.e. x = x1 + x2 / 2^5 * supported `interpolation`: `INTER_LINEAR` only +### [`cv::warpPerspective()`](https://docs.opencv.org/4.10.0/da/d54/group__imgproc__transform.html#gaf73673a7e8e18ec6963e3774e6a94b87) +Performs a perspective transformation on an image. + +Notes on parameters: +* `src.depth()` - only supports `CV_8U` depth and 1 channel. +* `src.cols`, `src.rows`, `dst.cols`, `dst.rows` - must be less than 2^24 +* `src.step` - must be less than 2^32 +* `dst.cols` - must be at least 8 +* `borderMode` - only supports `BORDER_REPLICATE` +* `interpolation` - supports `INTER_NEAREST` and `INTER_LINEAR` + ### [`cv::pyrDown()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#gaf9bba239dfca11654cb7f50f889fc2ff) Blurs and downsamples an image. diff --git a/kleidicv/CMakeLists.txt b/kleidicv/CMakeLists.txt index 1412394ab..0b0fa2c81 100644 --- a/kleidicv/CMakeLists.txt +++ b/kleidicv/CMakeLists.txt @@ -50,7 +50,6 @@ option(KLEIDICV_ASSUME_128BIT_SVE2 "Internal - If turned ON 128-bit SVE2 vector option(KLEIDICV_PREFER_INTERLEAVING_LOAD_STORE "Internal - If turned ON interleaving loads and stores are preferred instead of continuous loads and stores" OFF) option(KLEIDICV_EXPERIMENTAL_FEATURE_CANNY "Internal - Enable experimental Canny algorithm" OFF) option(KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV "Internal - If turned ON Canny algorithm creates bit exact result compared to OpenCV's original implementation" ON) -option(KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE "Internal - Enable experimental WarpPerspective algorithm" OFF) if(KLEIDICV_ENABLE_SME2 AND NOT KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS) set(KLEIDICV_ALWAYS_ENABLE_SME2 ON) diff --git a/kleidicv/include/kleidicv/config.h.in b/kleidicv/include/kleidicv/config.h.in index 3e28a4535..18a0970f3 100644 --- a/kleidicv/include/kleidicv/config.h.in +++ b/kleidicv/include/kleidicv/config.h.in @@ -21,8 +21,6 @@ #cmakedefine01 KLEIDICV_CANNY_ALGORITHM_CONFORM_OPENCV -#cmakedefine01 KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE - // Set to '1' if compiling NEON code paths, otherwise it is set to '0'. #ifndef KLEIDICV_TARGET_NEON #define KLEIDICV_TARGET_NEON 0 diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index bd4e38801..3c95ef4c3 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1870,7 +1870,6 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8( size_t src_channels, int16_t *dst, size_t dst_stride); #endif // DOXYGEN -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE /// Performs a perspective transformation on an image. /// For each pixel in `dst` take a pixel from `src` specified by /// the transformed x and y coordinates, and optionally doing a bilinear @@ -1894,12 +1893,14 @@ kleidicv_error_t kleidicv_scharr_interleaved_s16_u8( /// Must be a multiple of `sizeof(type)` and no less than /// `width * sizeof(type)`, except for single-row images. /// @param dst_width Number of elements in the destination row. Must be at -/// least 8. -/// @param dst_height Number of rows in the destination data. +/// least 8. Must be less than 2^24. +/// @param dst_height Number of rows in the destination data. Must be less +/// than 2^24. /// @param transformation Pointer to the transformation matrix of 9 values. /// @param channels Number of channels in the data. Must be 1. /// @param interpolation Interpolation algorithm. Supported types: \n /// - @ref KLEIDICV_INTERPOLATION_NEAREST +/// - @ref KLEIDICV_INTERPOLATION_LINEAR /// @param border_type Way of handling the border. The supported border types /// are: \n /// - @ref KLEIDICV_BORDER_TYPE_REPLICATE @@ -1911,7 +1912,6 @@ kleidicv_error_t kleidicv_warp_perspective_u8( const float transformation[9], size_t channels, kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value); -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE #ifdef __cplusplus } // extern "C" diff --git a/kleidicv/include/kleidicv/transform/warp_perspective.h b/kleidicv/include/kleidicv/transform/warp_perspective.h index e5fe74526..45b512764 100644 --- a/kleidicv/include/kleidicv/transform/warp_perspective.h +++ b/kleidicv/include/kleidicv/transform/warp_perspective.h @@ -24,18 +24,17 @@ KLEIDICV_API_DECLARATION(kleidicv_warp_perspective_stripe_u8, kleidicv_border_type_t border_type, const uint8_t *border_value); } - namespace kleidicv { template inline bool warp_perspective_is_implemented( - size_t dst_width, kleidicv_interpolation_type_t interpolation, - kleidicv_border_type_t border_type, - size_t channels) KLEIDICV_STREAMING_COMPATIBLE { + size_t dst_width, size_t channels, + kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t border_type) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value) { return (dst_width >= 8 && - interpolation == - kleidicv_interpolation_type_t::KLEIDICV_INTERPOLATION_NEAREST && + (interpolation == KLEIDICV_INTERPOLATION_NEAREST || + interpolation == KLEIDICV_INTERPOLATION_LINEAR) && border_type == kleidicv_border_type_t::KLEIDICV_BORDER_TYPE_REPLICATE && channels == 1); diff --git a/kleidicv/include/kleidicv/utils.h b/kleidicv/include/kleidicv/utils.h index e327cf938..e95ce94ea 100644 --- a/kleidicv/include/kleidicv/utils.h +++ b/kleidicv/include/kleidicv/utils.h @@ -301,16 +301,27 @@ class LoopUnroll2 final { const size_t n_step = UnrollFactor * step(); size_t max_index = index_ + (remaining_length() / n_step) * n_step; + // A tail mechanism is built into the single vector processing loop, if + // enabled. The single vector path is executed iteratively, and at the end + // it rewinds the loop to one vector before the end of the data, and + // executes one final vector path, so the scalar path can be omitted. if constexpr (try_to_avoid_tail_loop && (UnrollFactor == 1)) { - while (index_ < max_index) { - while (index_ < max_index) { - callback(index_); - index_ += n_step; - } - - if (remaining_length()) { - index_ = length_ - n_step; - max_index = length_; + // Enter this loop only if there's enough data for a full vector + if (length_ >= n_step) { + // External loop only ends when all data has been processed + while (index_ < length_) { + // Internal loop checks if the vector path can be executed + while (index_ < max_index) { + callback(index_); + index_ += n_step; + } + // Check if a final iteration is needed. The double loop is needed to + // avoid the repetition of the callback function, which is usually + // inlined into the binary. (Save some code space) + if (remaining_length()) { + index_ = length_ - n_step; + max_index = length_; + } } } } else { diff --git a/kleidicv/src/transform/warp_perspective_api.cpp b/kleidicv/src/transform/warp_perspective_api.cpp index 797b603c4..16b80e121 100644 --- a/kleidicv/src/transform/warp_perspective_api.cpp +++ b/kleidicv/src/transform/warp_perspective_api.cpp @@ -2,12 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "kleidicv/ctypes.h" #include "kleidicv/dispatch.h" #include "kleidicv/kleidicv.h" #include "kleidicv/transform/warp_perspective.h" -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE - KLEIDICV_MULTIVERSION_C_API(kleidicv_warp_perspective_stripe_u8, &kleidicv::neon::warp_perspective_stripe, &kleidicv::sve2::warp_perspective_stripe, @@ -22,7 +21,7 @@ kleidicv_error_t kleidicv_warp_perspective_u8( kleidicv_interpolation_type_t interpolation, kleidicv_border_type_t border_type, const uint8_t *border_value) { if (!kleidicv::warp_perspective_is_implemented( - dst_width, interpolation, border_type, channels)) { + dst_width, channels, interpolation, border_type)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } @@ -33,4 +32,3 @@ kleidicv_error_t kleidicv_warp_perspective_u8( } } // extern "C" -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE diff --git a/kleidicv/src/transform/warp_perspective_neon.cpp b/kleidicv/src/transform/warp_perspective_neon.cpp index 7c84ee2d4..bba3f7e82 100644 --- a/kleidicv/src/transform/warp_perspective_neon.cpp +++ b/kleidicv/src/transform/warp_perspective_neon.cpp @@ -2,13 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 +#include + #include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" #include "kleidicv/traits.h" #include "kleidicv/transform/warp_perspective.h" - -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE +#include "kleidicv/utils.h" namespace kleidicv::neon { @@ -29,112 +30,207 @@ namespace kleidicv::neon { // xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // -template -class WarpPerspective; - -template -class WarpPerspective { - public: - using ScalarType = uint8_t; - using CoordVecTraits = VecTraits; - using CoordVector = CoordVecTraits::VectorType; - - WarpPerspective(Rows src_rows, size_t src_width, - size_t src_height) - : src_rows_{src_rows}, - src_height_{src_height}, - v_src_stride_{vdup_n_u32(static_cast(src_rows_.stride()))}, - vq_src_stride_{vdupq_n_u32(static_cast(src_rows_.stride()))}, - x0123_{vld1q(first_few_x)}, - v_xmax_{vdupq_n_f32(static_cast(src_width - 1))}, - v_ymax_{vdupq_n_f32(static_cast(src_height - 1))} {} - - void process_row(size_t y, size_t width, const float transform[9], - Columns dst) { + +typedef struct { + float32x4_t x, y; +} CoordVectorPair; + +template +void warp_perspective_operation(Rows src_rows, + size_t src_width, size_t src_height, + const float transform[9], + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end) { + static constexpr uint32_t first_few_x[] = {0, 1, 2, 3}; + uint32x4_t x0123_ = vld1q_u32(first_few_x); + + uint32x2_t v_src_stride = + vdup_n_u32(static_cast(src_rows.stride())); + uint32x4_t vq_src_stride = + vdupq_n_u32(static_cast(src_rows.stride())); + uint32x4_t v_xmax = vdupq_n_u32(static_cast(src_width - 1)); + uint32x4_t v_ymax = vdupq_n_u32(static_cast(src_height - 1)); + + float32x4_t tx0, ty0, tw0; + + auto calculate_coordinates = [&](uint32_t x) { + // The next few values can be calculated by adding the corresponding Tn*x + float32x4_t fx = vcvtq_f32_u32(vaddq_u32(x0123_, vdupq_n_u32(x))); + float32x4_t tx = vmlaq_n_f32(tx0, fx, transform[0]); + float32x4_t ty = vmlaq_n_f32(ty0, fx, transform[3]); + float32x4_t tw = vmlaq_n_f32(tw0, fx, transform[6]); + + // Calculate inverse weight because division is expensive + float32x4_t iw = vdivq_f32(vdupq_n_f32(1.F), tw); + // Calculate coordinates into the source image + float32x4_t xf = vmulq_f32(tx, iw); + float32x4_t yf = vmulq_f32(ty, iw); + return CoordVectorPair{xf, yf}; + }; + + auto vector_path_nearest_small = [&](uint32_t x, Columns dst) { + auto &&[xf, yf] = calculate_coordinates(x); + // Take the integer part, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); + uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); + // Calculate offsets from coordinates (y * stride + x) + // Use this path only when the final offsets fit into 32 bits + uint32x4_t offsets = vmlaq_u32(xi, yi, vq_src_stride); + // Copy pixels from source + ptrdiff_t ix = static_cast(x); + dst[ix] = src_rows[vgetq_lane_u32(offsets, 0)]; + dst[ix + 1] = src_rows[vgetq_lane_u32(offsets, 1)]; + dst[ix + 2] = src_rows[vgetq_lane_u32(offsets, 2)]; + dst[ix + 3] = src_rows[vgetq_lane_u32(offsets, 3)]; + }; + + auto vector_path_nearest_large = [&](uint32_t x, Columns dst) { + auto &&[xf, yf] = calculate_coordinates(x); + // Take the integer part, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + uint32x4_t xi = vminq_u32(v_xmax, vcvtaq_u32_f32(xf)); + uint32x4_t yi = vminq_u32(v_ymax, vcvtaq_u32_f32(yf)); + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + uint64x2_t offsets_low = + vmlal_u32(vmovl_u32(vget_low_u32(xi)), vget_low_u32(yi), v_src_stride); + uint64x2_t offsets_high = + vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride); + // Copy pixels from source + ptrdiff_t ix = static_cast(x); + dst[ix] = src_rows[vgetq_lane_u64(offsets_low, 0)]; + dst[ix + 1] = src_rows[vgetq_lane_u64(offsets_low, 1)]; + dst[ix + 2] = src_rows[vgetq_lane_u64(offsets_high, 0)]; + dst[ix + 3] = src_rows[vgetq_lane_u64(offsets_high, 1)]; + }; + + auto load_src_into_floats_small = [&](uint32x4_t x, uint32x4_t y) { + uint32x4_t offset = vmlaq_u32(x, y, vq_src_stride); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u32(offset, 0)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = static_cast(src_rows[vgetq_lane_u32(offset, 2)]) | + (static_cast(src_rows[vgetq_lane_u32(offset, 3)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto load_src_into_floats_large = [&](uint32x4_t x, uint32x4_t y) { + uint64x2_t offset_low = + vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), v_src_stride); + uint64x2_t offset_high = + vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), v_src_stride); + uint64_t acc = + static_cast(src_rows[vgetq_lane_u64(offset_low, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_low, 1)]) << 32); + uint64x2_t rawsrc = vdupq_n_u64(acc); + acc = + static_cast(src_rows[vgetq_lane_u64(offset_high, 0)]) | + (static_cast(src_rows[vgetq_lane_u64(offset_high, 1)]) << 32); + rawsrc = vsetq_lane_u64(acc, rawsrc, 1); + return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); + }; + + auto calculate_linear = [&](uint32_t x) { + auto load_floats = [&](uint32x4_t x, uint32x4_t y) { + if constexpr (IsLarge) { + return load_src_into_floats_large(x, y); + } else { + return load_src_into_floats_small(x, y); + } + }; + auto &&[xf, yf] = calculate_coordinates(x); + // Truncating convert to int + uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); + uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); + + // Get fractional part, or 0 if out of range + float32x4_t zero = vdupq_n_f32(0.F); + uint32x4_t x_in_range = + vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); + uint32x4_t y_in_range = + vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); + float32x4_t xfrac = + vbslq_f32(x_in_range, vsubq_f32(xf, vrndmq_f32(xf)), zero); + float32x4_t yfrac = + vbslq_f32(y_in_range, vsubq_f32(yf, vrndmq_f32(yf)), zero); + + // x1 = x0 + 1, except if it's already xmax or out of range + uint32x4_t x1 = vsubq_u32(x0, x_in_range); + uint32x4_t y1 = vsubq_u32(y0, y_in_range); + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + float32x4_t a = load_floats(x0, y0); + float32x4_t b = load_floats(x1, y0); + float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); + float32x4_t c = load_floats(x0, y1); + float32x4_t d = load_floats(x1, y1); + float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); + float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); + return vminq_u32(vdupq_n_u32(0xFF), vcvtaq_u32_f32(result)); + }; + + auto process_row = [&](uint32_t y, Columns dst) { float dy = static_cast(y); // Calculate half-transformed values at the first pixel (nominators) // tw = T6*x + T7*y + T8 // tx = (T0*x + T1*y + T2) / tw // ty = (T3*x + T4*y + T5) / tw - float x0 = transform[1] * dy + transform[2]; - float y0 = transform[4] * dy + transform[5]; - float w0 = transform[7] * dy + transform[8]; - // The next few values can be calculated by adding the corresponding Tn*x - CoordVector tx0 = - vmlaq_f32(vdupq_n_f32(x0), x0123_, vdupq_n_f32(transform[0])); - CoordVector ty0 = - vmlaq_f32(vdupq_n_f32(y0), x0123_, vdupq_n_f32(transform[3])); - CoordVector tw0 = - vmlaq_f32(vdupq_n_f32(w0), x0123_, vdupq_n_f32(transform[6])); - - // (Nearest) integer part of transformed coordinates - uint32x4_t xi, yi; - - auto calculate_coordinates = [&](size_t x) { - float fx = static_cast(x); - // Calculate half-transformed values from the first few pixel values, plus - // Tn*x, similarly to the one above - CoordVector tx = vaddq_f32(tx0, vdupq_n_f32(transform[0] * fx)); - CoordVector ty = vaddq_f32(ty0, vdupq_n_f32(transform[3] * fx)); - CoordVector tw = vaddq_f32(tw0, vdupq_n_f32(transform[6] * fx)); - - // Calculate inverse weight because division is expensive - CoordVector iw = vdivq_f32(vdupq_n_f32(1.F), tw); - // Calc and clamp coordinates to within the dimensions of the source image - CoordVector xf = - vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(tx, iw), v_xmax_)); - CoordVector yf = - vmaxq_f32(vdupq_n_f32(0.F), vminq_f32(vmulq_f32(ty, iw), v_ymax_)); - // Rounding convert to int - xi = vcvtaq_u32_f32(xf); - yi = vcvtaq_u32_f32(yf); - }; - - auto large_vector_path = [&](size_t x) { - calculate_coordinates(x); - // Calculate offsets from coordinates (y * stride + x) - // To avoid losing precision, the final indices should be in 64 bits - uint64x2_t indices_low = vmlal_u32(vmovl_u32(vget_low_u32(xi)), - vget_low_u32(yi), v_src_stride_); - uint64x2_t indices_high = - vmlal_u32(vmovl_high_u32(xi), vget_high_u32(yi), v_src_stride_); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows_[vgetq_lane_u64(indices_low, 0)]; - dst[ix + 1] = src_rows_[vgetq_lane_u64(indices_low, 1)]; - dst[ix + 2] = src_rows_[vgetq_lane_u64(indices_high, 0)]; - dst[ix + 3] = src_rows_[vgetq_lane_u64(indices_high, 1)]; - }; - - auto small_vector_path = [&](size_t x) { - calculate_coordinates(x); - // Calculate offsets from coordinates (y * stride + x) - // Use this path only when the final indices fit into 32 bits - uint32x4_t indices = vmlaq_u32(xi, yi, vq_src_stride_); - // Copy pixels from source - ptrdiff_t ix = static_cast(x); - dst[ix] = src_rows_[vgetq_lane_u32(indices, 0)]; - dst[ix + 1] = src_rows_[vgetq_lane_u32(indices, 1)]; - dst[ix + 2] = src_rows_[vgetq_lane_u32(indices, 2)]; - dst[ix + 3] = src_rows_[vgetq_lane_u32(indices, 3)]; - }; + tx0 = vdupq_n_f32(transform[1] * dy + transform[2]); + ty0 = vdupq_n_f32(transform[4] * dy + transform[5]); + tw0 = vdupq_n_f32(transform[7] * dy + transform[8]); - LoopUnroll2 loop{width, CoordVecTraits::num_lanes()}; - if constexpr (IsLarge) { - loop.unroll_once(large_vector_path); + static const size_t kStep = VecTraits::num_lanes(); + LoopUnroll2 loop{dst_width, kStep}; + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + if constexpr (IsLarge) { + loop.unroll_once([&](size_t x) { + vector_path_nearest_large(static_cast(x), dst); + }); + } else { + loop.unroll_once([&](size_t x) { + vector_path_nearest_small(static_cast(x), dst); + }); + } + } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { + loop.unroll_four_times([&](size_t _x) { + uint32_t x = static_cast(_x); + ScalarType *p_dst = &dst[static_cast(_x)]; + uint32x4_t res0 = calculate_linear(x); + x += kStep; + uint32x4_t res1 = calculate_linear(x); + uint16x8_t result16_0 = vuzp1q_u16(res0, res1); + x += kStep; + res0 = calculate_linear(x); + x += kStep; + res1 = calculate_linear(x); + uint16x8_t result16_1 = vuzp1q_u16(res0, res1); + vst1q_u8(p_dst, vuzp1q_u8(result16_0, result16_1)); + }); + loop.unroll_once([&](size_t x) { + ScalarType *p_dst = &dst[static_cast(x)]; + uint32x4_t res = calculate_linear(static_cast(x)); + p_dst[0] = vgetq_lane_u32(res, 0); + p_dst[1] = vgetq_lane_u32(res, 1); + p_dst[2] = vgetq_lane_u32(res, 2); + p_dst[3] = vgetq_lane_u32(res, 3); + }); } else { - loop.unroll_once(small_vector_path); + static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || + Inter == KLEIDICV_INTERPOLATION_LINEAR, + ": Unknown interpolation type!"); } - } + }; - private: - static constexpr float first_few_x[] = {0.F, 1.F, 2.F, 3.F}; - Rows src_rows_; - size_t src_height_; - uint32x2_t v_src_stride_; - uint32x4_t vq_src_stride_; - CoordVector x0123_, v_xmax_, v_ymax_; -}; // end of class WarpPerspective + for (size_t y = y_begin; y < y_end; ++y) { + process_row(y, dst_rows.as_columns()); + ++dst_rows; + } +} // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) @@ -143,8 +239,8 @@ kleidicv_error_t warp_perspective_stripe( const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t y_begin, size_t y_end, const float transformation[9], - size_t channels, kleidicv_interpolation_type_t, kleidicv_border_type_t, - const T *) { + size_t channels, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t, const T *) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTERS(transformation); @@ -153,35 +249,50 @@ kleidicv_error_t warp_perspective_stripe( // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits - if (src_stride >= (1ULL << 32) || src_width >= (1ULL << 24) || - src_height >= (1ULL << 24)) { + if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + src_stride >= (1ULL << 32)) { return KLEIDICV_ERROR_RANGE; } Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; - Rectangle rect{dst_width, dst_height}; dst_rows += y_begin; - if (KLEIDICV_LIKELY(src_stride * src_height < (1ULL << 32))) { - WarpPerspective operation{src_rows, src_width, src_height}; - for (size_t y = y_begin; y < y_end; ++y) { - operation.process_row(y, rect.width(), transformation, - dst_rows.as_columns()); - ++dst_rows; + if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { + switch (interpolation) { + case KLEIDICV_INTERPOLATION_NEAREST: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + case KLEIDICV_INTERPOLATION_LINEAR: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + default: + break; // GCOVR_EXCL_LINE } } else { - WarpPerspective operation{src_rows, src_width, src_height}; - for (size_t y = y_begin; y < y_end; ++y) { - operation.process_row(y, rect.width(), transformation, - dst_rows.as_columns()); - ++dst_rows; + switch (interpolation) { + case KLEIDICV_INTERPOLATION_NEAREST: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + case KLEIDICV_INTERPOLATION_LINEAR: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + default: + break; // GCOVR_EXCL_LINE } } - + // NOLINTEND(readability-function-cognitive-complexity) return KLEIDICV_OK; } -// NOLINTEND(readability-function-cognitive-complexity) #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \ template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \ @@ -195,4 +306,3 @@ kleidicv_error_t warp_perspective_stripe( KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t); } // namespace kleidicv::neon -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE diff --git a/kleidicv/src/transform/warp_perspective_sc.h b/kleidicv/src/transform/warp_perspective_sc.h index aaa358ec7..a790a6acf 100644 --- a/kleidicv/src/transform/warp_perspective_sc.h +++ b/kleidicv/src/transform/warp_perspective_sc.h @@ -11,8 +11,8 @@ #include "kleidicv/sve2.h" #include "kleidicv/traits.h" #include "kleidicv/transform/warp_perspective.h" +#include "kleidicv/types.h" -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE namespace KLEIDICV_TARGET_NAMESPACE { // Gather load is not available in streaming mode, and in general random access @@ -36,161 +36,248 @@ namespace KLEIDICV_TARGET_NAMESPACE { // xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) // -template -class WarpPerspective; - -template <> -class WarpPerspective { - public: - using ScalarType = uint8_t; - using DstVecTraits = VecTraits; - using CoordVecTraits = VecTraits; - using CoordVector = CoordVecTraits::VectorType; - - WarpPerspective(Rows src_rows, size_t src_width, - size_t src_height, - const float tr_[9]) KLEIDICV_STREAMING_COMPATIBLE - : src_rows_{src_rows}, - src_width_{src_width}, - src_height_{src_height}, - tr_{tr_} {} - - void process_row(size_t y, size_t width, Columns dst) { - svfloat32_t svzero = svdup_n_f32(0.F); - svfloat32_t sv_xmax = svdup_n_f32(static_cast(src_width_ - 1)); - svfloat32_t sv_ymax = svdup_n_f32(static_cast(src_height_ - 1)); - svuint32_t sv_src_stride = svdup_n_u32(src_rows_.stride()); - svbool_t pg_all64 = svptrue_b64(); - svbool_t pg_all32 = svptrue_b32(); - svbool_t pg_all8 = svptrue_b8(); - float fy = static_cast(y); - svfloat32_t vx0 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); - // (Nearest) integer part of transformed coordinates - svuint32_t xi, yi; - // Calculate half-transformed base values (nominators) at the first vector - // (x=0) - // tw = T6*x + T7*y + T8 - // tx = (T0*x + T1*y + T2) / tw - // ty = (T3*x + T4*y + T5) / tw - svfloat32_t tw0 = svdup_n_f32(tr_[7] * fy + tr_[8]); - svfloat32_t tx0 = svdup_n_f32(tr_[1] * fy + tr_[2]); - svfloat32_t ty0 = svdup_n_f32(tr_[4] * fy + tr_[5]); - auto calculate_coordinates = [&](size_t x) { - svfloat32_t vx = svadd_n_f32_x(pg_all32, vx0, static_cast(x)); - svfloat32_t tw = svmla_x(pg_all32, tw0, vx, tr_[6]); - svfloat32_t tx = svmla_x(pg_all32, tx0, vx, tr_[0]); - svfloat32_t ty = svmla_x(pg_all32, ty0, vx, tr_[3]); - - // Calculate inverse weight because division is expensive - svfloat32_t iw = svdiv_f32_x(pg_all32, svdup_n_f32(1.F), tw); - // Calc and clamp coordinates to within the dimensions of the source image - svfloat32_t dx = - svmax_x(pg_all32, svzero, - svmin_x(pg_all32, svmad_x(pg_all32, tx, iw, 0.5F), sv_xmax)); - svfloat32_t dy = - svmax_x(pg_all32, svzero, - svmin_x(pg_all32, svmad_x(pg_all32, ty, iw, 0.5F), sv_ymax)); - - xi = svcvt_u32_f32_x(pg_all32, dx); - yi = svcvt_u32_f32_x(pg_all32, dy); - }; - auto calculate_and_load_64bits = [&](svbool_t pg_b, svbool_t pg_t, - size_t x) { - calculate_coordinates(x); - svuint64_t indices_b = svmlalb(svmovlb(xi), yi, sv_src_stride); - svuint64_t indices_t = svmlalt(svmovlt(xi), yi, sv_src_stride); - // Copy pixels from source - svuint64_t result_b = - svld1ub_gather_offset_u64(pg_b, &src_rows_[0], indices_b); - svuint64_t result_t = - svld1ub_gather_offset_u64(pg_t, &src_rows_[0], indices_t); - return svtrn1_u32(svreinterpret_u32_u64(result_b), - svreinterpret_u32_u64(result_t)); - }; +template +void warp_perspective_operation(Rows src_rows, + size_t src_width, size_t src_height, + const float transform[9], + Rows dst_rows, size_t dst_width, + size_t y_begin, size_t y_end) { + svbool_t pg_all64 = svptrue_b64(); + svbool_t pg_all32 = svptrue_b32(); + svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); + svuint32_t sv_xmax = svdup_n_u32(src_width - 1); + svuint32_t sv_ymax = svdup_n_u32(src_height - 1); + svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); - auto calculate_and_load_32bits = [&](svbool_t pg, size_t x) { - calculate_coordinates(x); - svuint32_t indices = svmla_x(pg, xi, yi, sv_src_stride); - return svld1ub_gather_offset_u32(pg, &src_rows_[0], indices); - }; + svfloat32_t T0 = svdup_n_f32(transform[0]); + svfloat32_t T3 = svdup_n_f32(transform[3]); + svfloat32_t T6 = svdup_n_f32(transform[6]); + svfloat32_t tx0, ty0, tw0; + svfloat32_t xf, yf; + svfloat32_t xmaxf = svdup_n_f32(static_cast(src_width - 1)); + svfloat32_t ymaxf = svdup_n_f32(static_cast(src_height - 1)); + svuint32_t xi, yi; - auto vector_path_generic = [&](size_t x, size_t x_max) { - size_t length = x_max - x; - svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2); - svbool_t pg_t = svwhilelt_b64(0ULL, length / 2); - svbool_t pg = svwhilelt_b32(0ULL, length); - // To avoid losing precision, the final indices use 64 bits - svuint32_t result = calculate_and_load_64bits(pg_b, pg_t, x); - svst1b_u32(pg, &dst[0], result); - dst += static_cast(length); - }; + svbool_t pg32 = pg_all32; + svbool_t pg64_b = pg_all64, pg64_t = pg_all64; + + const size_t kStep = VecTraits::num_lanes(); + + auto calculate_coordinates = [&](size_t x) { + svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast(x)); + // Calculate half-transformed values from the first few pixel values, + // plus Tn*x, similarly to the one above + // Calculate inverse weight because division is expensive + svfloat32_t iw = + svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); + svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); + svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); + + // Calculate coordinates into the source image + xf = svmul_f32_x(pg_all32, tx, iw); + yf = svmul_f32_x(pg_all32, ty, iw); + }; - auto vector_path_4x_large = [&](size_t x) { - // Copy pixels from source - // To avoid losing precision, the final indices use 64 bits - svuint32_t res32_0 = calculate_and_load_64bits(pg_all64, pg_all64, x); - x += CoordVecTraits::num_lanes(); - svuint32_t res32_1 = calculate_and_load_64bits(pg_all64, pg_all64, x); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += CoordVecTraits::num_lanes(); - res32_0 = calculate_and_load_64bits(pg_all64, pg_all64, x); - x += CoordVecTraits::num_lanes(); - res32_1 = calculate_and_load_64bits(pg_all64, pg_all64, x); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(pg_all8, &dst[0], result); - dst += static_cast(4 * CoordVecTraits::num_lanes()); + auto calculate_nearest_coordinates = [&](size_t x) { + calculate_coordinates(x); + // Round to the nearest integer, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + xi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), + sv_xmax); + yi = svmin_x(pg_all32, + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), + sv_ymax); + }; + + auto load_small = [&](svbool_t pg, svuint32_t x, svuint32_t y) { + svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); + return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); + }; + + auto load_large = [&](svbool_t pg_b, svbool_t pg_t, svuint32_t x, + svuint32_t y) { + // Calculate offsets from coordinates (y * stride + x) + // To avoid losing precision, the final offsets should be in 64 bits + svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); + svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); + // Copy pixels from source + svuint64_t result_b = + svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); + svuint64_t result_t = + svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); + return svtrn1_u32(svreinterpret_u32_u64(result_b), + svreinterpret_u32_u64(result_t)); + }; + + auto vector_path_nearest_4x = [&](size_t x, Columns dst) { + auto load_source = [&](svuint32_t x, svuint32_t y) { + if constexpr (IsLarge) { + return load_large(pg_all64, pg_all64, x, y); + } else { + return load_small(pg_all32, x, y); + } }; + ScalarType *p_dst = &dst[static_cast(x)]; + calculate_nearest_coordinates(x); + svuint32_t res32_0 = load_source(xi, yi); + x += kStep; + calculate_nearest_coordinates(x); + svuint32_t res32_1 = load_source(xi, yi); + svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + x += kStep; + calculate_nearest_coordinates(x); + res32_0 = load_source(xi, yi); + x += kStep; + calculate_nearest_coordinates(x); + res32_1 = load_source(xi, yi); + svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), + svreinterpret_u16_u32(res32_1)); + svuint8_t result = + svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1)); + svst1(svptrue_b8(), p_dst, result); + }; + + auto vector_path_nearest_tail = [&](size_t x, size_t x_max, + Columns dst) { + size_t length = x_max - x; + svbool_t pg_b = svwhilelt_b64(0ULL, (length + 1) / 2); + svbool_t pg_t = svwhilelt_b64(0ULL, length / 2); + svbool_t pg = svwhilelt_b32(0ULL, length); + // To avoid losing precision, the final indices use 64 bits + calculate_nearest_coordinates(x); + svuint32_t result = load_large(pg_b, pg_t, xi, yi); + svst1b_u32(pg, &dst[static_cast(x)], result); + }; - auto vector_path_4x_small = [&](size_t x) { - // Copy pixels from source - // Use this path only when the final indices fit into 32 bits - svuint32_t res32_0 = calculate_and_load_32bits(pg_all32, x); - x += CoordVecTraits::num_lanes(); - svuint32_t res32_1 = calculate_and_load_32bits(pg_all32, x); - svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - x += CoordVecTraits::num_lanes(); - res32_0 = calculate_and_load_32bits(pg_all32, x); - x += CoordVecTraits::num_lanes(); - res32_1 = calculate_and_load_32bits(pg_all32, x); - svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), - svreinterpret_u16_u32(res32_1)); - svuint8_t result = svuzp1_u8(svreinterpret_u8_u16(result0), - svreinterpret_u8_u16(result1)); - svst1(pg_all8, &dst[0], result); - dst += static_cast(4 * CoordVecTraits::num_lanes()); + auto calculate_linear = [&](size_t x) { + auto load_source = [&](svuint32_t x, svuint32_t y) { + if constexpr (IsLarge) { + return load_large(pg64_b, pg64_t, x, y); + } else { + return load_small(pg32, x, y); + } }; - LoopUnroll2 loop{width, CoordVecTraits::num_lanes()}; - if (KLEIDICV_LIKELY(src_rows_.stride() * src_height_ < (1ULL << 32))) { - loop.unroll_four_times(vector_path_4x_small); + calculate_coordinates(x); + // Take the integer part, clamp it to within the dimensions of the + // source image (negative values are already saturated to 0) + svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf)); + svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf)); + + // Get fractional part, or 0 if out of range + svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F), + svcmplt_f32(pg_all32, xf, xmaxf)); + svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F), + svcmplt_f32(pg_all32, yf, ymaxf)); + svfloat32_t xfrac = svsel_f32( + x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)), + svdup_n_f32(0.F)); + svfloat32_t yfrac = svsel_f32( + y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)), + svdup_n_f32(0.F)); + + // x1 = x0 + 1, except if it's already xmax or out of range + svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0); + svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0); + + // Calculate offsets from coordinates (y * stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + svfloat32_t a = svcvt_f32_u32_x(pg_all32, load_source(x0, y0)); + svfloat32_t b = svcvt_f32_u32_x(pg_all32, load_source(x1, y0)); + svfloat32_t line0 = + svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); + svfloat32_t c = svcvt_f32_u32_x(pg_all32, load_source(x0, y1)); + svfloat32_t d = svcvt_f32_u32_x(pg_all32, load_source(x1, y1)); + svfloat32_t line1 = + svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); + svfloat32_t result = svmla_f32_x( + pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); + return svmin_u32_x( + pg_all32, svdup_n_u32(0xFF), + svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F))); + }; + + auto process_row = [&](size_t y, Columns dst) { + float fy = static_cast(y); + // Calculate half-transformed values at the first pixel (nominators) + // tw = T6*x + T7*y + T8 + // tx = (T0*x + T1*y + T2) / tw + // ty = (T3*x + T4*y + T5) / tw + tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2])); + ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5])); + tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8])); + + pg32 = pg_all32; + pg64_b = pg64_t = pg_all64; + + LoopUnroll2 loop{dst_width, kStep}; + if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { + loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); }); + loop.unroll_once( + [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); }); + loop.remaining([&](size_t x, size_t length) { + vector_path_nearest_tail(x, length, dst); + }); + } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { + loop.unroll_four_times([&](size_t x) { + ScalarType *p_dst = &dst[static_cast(x)]; + svuint32_t res0 = calculate_linear(x); + x += kStep; + svuint32_t res1 = calculate_linear(x); + svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + x += kStep; + res0 = calculate_linear(x); + x += kStep; + res1 = calculate_linear(x); + svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), + svreinterpret_u16_u32(res1)); + svst1_u8(svptrue_b8(), p_dst, + svuzp1_u8(svreinterpret_u8_u16(result16_0), + svreinterpret_u8_u16(result16_1))); + }); + loop.unroll_once([&](size_t x) { + ScalarType *p_dst = &dst[static_cast(x)]; + svuint32_t result = calculate_linear(x); + svst1b_u32(pg_all32, p_dst, result); + }); + loop.remaining([&](size_t x, size_t x_max) { + ScalarType *p_dst = &dst[static_cast(x)]; + size_t length = x_max - x; + pg32 = svwhilelt_b32(0ULL, length); + if constexpr (IsLarge) { + pg64_b = svwhilelt_b64(0ULL, (length + 1) / 2); + pg64_t = svwhilelt_b64(0ULL, length / 2); + } + svuint32_t result = calculate_linear(x); + svst1b_u32(pg32, p_dst, result); + }); } else { - loop.unroll_four_times(vector_path_4x_large); + static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || + Inter == KLEIDICV_INTERPOLATION_LINEAR, + ": Unknown interpolation type!"); } - loop.unroll_once([&](size_t x) { - vector_path_generic(x, x + CoordVecTraits::num_lanes()); - }); - loop.remaining(vector_path_generic); - } + }; - private: - Rows src_rows_; - size_t src_width_, src_height_; - const float *tr_; -}; // end of class WarpPerspective + for (size_t y = y_begin; y < y_end; ++y) { + process_row(y, dst_rows.as_columns()); + ++dst_rows; + } +} +// Most of the complexity comes from parameter checking. +// NOLINTBEGIN(readability-function-cognitive-complexity) template KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( const T *src, size_t src_stride, size_t src_width, size_t src_height, T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, size_t y_begin, size_t y_end, const float transformation[9], - size_t channels, kleidicv_interpolation_type_t, kleidicv_border_type_t, - const T *) { + size_t channels, kleidicv_interpolation_type_t interpolation, + kleidicv_border_type_t, const T *) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_POINTERS(transformation); @@ -199,24 +286,53 @@ KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe_sc( // Calculating in float32_t will only be precise until 24 bits, and // multiplication can only be done with 32x32 bits - if (src_stride >= (1ULL << 32) || src_width >= (1ULL << 24) || - src_height >= (1ULL << 24)) { + if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || + dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || + src_stride >= (1ULL << 32)) { return KLEIDICV_ERROR_RANGE; } Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; - WarpPerspective operation{src_rows, src_width, src_height, transformation}; Rectangle rect{dst_width, dst_height}; dst_rows += y_begin; - for (size_t y = y_begin; y < y_end; ++y) { - operation.process_row(y, rect.width(), dst_rows.as_columns()); - ++dst_rows; + + if (KLEIDICV_UNLIKELY(src_rows.stride() * src_height >= (1ULL << 32))) { + switch (interpolation) { + case KLEIDICV_INTERPOLATION_NEAREST: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + case KLEIDICV_INTERPOLATION_LINEAR: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + default: + break; // GCOVR_EXCL_LINE + } + } else { + switch (interpolation) { + case KLEIDICV_INTERPOLATION_NEAREST: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + case KLEIDICV_INTERPOLATION_LINEAR: + warp_perspective_operation( + src_rows, src_width, src_height, transformation, dst_rows, + dst_width, y_begin, y_end); + break; + default: + break; // GCOVR_EXCL_LINE + } } return KLEIDICV_OK; } +// NOLINTEND(readability-function-cognitive-complexity) #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(type) \ template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \ @@ -232,4 +348,3 @@ KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE_SC(uint8_t); #endif } // namespace KLEIDICV_TARGET_NAMESPACE -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE diff --git a/kleidicv/src/transform/warp_perspective_sve2.cpp b/kleidicv/src/transform/warp_perspective_sve2.cpp index 8310053d3..5e3649050 100644 --- a/kleidicv/src/transform/warp_perspective_sve2.cpp +++ b/kleidicv/src/transform/warp_perspective_sve2.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "kleidicv/ctypes.h" #include "warp_perspective_sc.h" -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE namespace kleidicv::sve2 { template @@ -32,4 +32,3 @@ kleidicv_error_t warp_perspective_stripe( KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t); } // namespace kleidicv::sve2 -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 51189095f..0bd45c7b1 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -5,6 +5,7 @@ #ifndef KLEIDICV_THREAD_H #define KLEIDICV_THREAD_H +#include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" #ifdef __cplusplus diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index b6fcecc6b..166da5d28 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -10,6 +10,7 @@ #include #include "kleidicv/arithmetics/rotate.h" +#include "kleidicv/ctypes.h" #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" #include "kleidicv/filters/scharr.h" @@ -697,7 +698,6 @@ kleidicv_error_t kleidicv_thread_remap_s16point5_u8( return parallel_batches(callback, mt, dst_height); } -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE kleidicv_error_t kleidicv_thread_warp_perspective_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, @@ -706,7 +706,7 @@ kleidicv_error_t kleidicv_thread_warp_perspective_u8( kleidicv_border_type_t border_type, const uint8_t *border_value, kleidicv_thread_multithreading mt) { if (!kleidicv::warp_perspective_is_implemented( - dst_width, interpolation, border_type, channels)) { + dst_width, channels, interpolation, border_type)) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } @@ -718,4 +718,3 @@ kleidicv_error_t kleidicv_thread_warp_perspective_u8( }; return parallel_batches(callback, mt, dst_height); } -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 48060a762..c3bde40f5 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -80,8 +80,10 @@ InRange_F32: opencv_perf_core '*inRangeScalar/*' '($PIXEL_FORMAT, 32FC1, 1, 2)' Remap_S16_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_NEAREST, BORDER_REPLICATE)' Remap_S16Point5_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' -WarpPerspective: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' -WarpPerspectiveNear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' +WarpPerspective_Nearest: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' +WarpPerspectiveNear_Nearest: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_NEAREST, BORDER_REPLICATE, 8UC1)' +WarpPerspective_Linear: opencv_perf_imgproc '*WarpPerspective/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)' +WarpPerspectiveNear_Linear: opencv_perf_imgproc '*WarpPerspectiveNear/*' '($PIXEL_FORMAT, INTER_LINEAR, BORDER_REPLICATE, 8UC1)' BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)' diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index e36bfbfe5..6f09b8855 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -9,6 +9,7 @@ #include "framework/array.h" #include "framework/generator.h" +#include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" #include "kleidicv_thread/kleidicv_thread.h" #include "multithreading_fake.h" @@ -247,7 +248,9 @@ class Thread : public testing::TestWithParam

{ typename... Args> void check_warp_perspective(SingleThreadedFunc single_threaded_func, MultithreadedFunc multithreaded_func, - size_t channels, Args... args) { + size_t channels, + kleidicv_interpolation_type_t interpolation, + Args... args) { unsigned test_width = 0, height = 0, thread_count = 0; std::tie(test_width, height, thread_count) = GetParam(); const unsigned src_width = 300, src_height = 300; @@ -265,12 +268,13 @@ class Thread : public testing::TestWithParam

{ // clang-format on kleidicv_error_t single_result = single_threaded_func( src.data(), src.stride(), src_width, src_height, dst_single.data(), - dst_single.stride(), width, height, transform, channels, args...); + dst_single.stride(), width, height, transform, channels, interpolation, + args...); kleidicv_error_t multi_result = multithreaded_func( src.data(), src.stride(), src_width, src_height, dst_multi.data(), - dst_multi.stride(), width, height, transform, channels, args..., - get_multithreading_fake(thread_count)); + dst_multi.stride(), width, height, transform, channels, interpolation, + args..., get_multithreading_fake(thread_count)); EXPECT_EQ(KLEIDICV_OK, single_result); EXPECT_EQ(KLEIDICV_OK, multi_result); @@ -625,7 +629,6 @@ TEST_P(Thread, remap_s16point5_u8_not_implemented) { border_value); } -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE TEST_P(Thread, warp_perspective_u8_border_replicate) { const uint8_t border_value[4] = {}; check_warp_perspective(kleidicv_warp_perspective_u8, @@ -634,19 +637,29 @@ TEST_P(Thread, warp_perspective_u8_border_replicate) { KLEIDICV_BORDER_TYPE_REPLICATE, border_value); } +TEST_P(Thread, warp_perspective_u8_linear_border_replicate) { + const uint8_t border_value[4] = {}; + check_warp_perspective(kleidicv_warp_perspective_u8, + kleidicv_thread_warp_perspective_u8, 1, + KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, border_value); +} + TEST_P(Thread, warp_perspective_u8_not_implemented) { const uint8_t border_value[4] = {}; - check_warp_perspective_not_implemented( - kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, - KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_warp_perspective_not_implemented( kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, border_value); check_warp_perspective_not_implemented( kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_CONSTANT, border_value); + check_warp_perspective_not_implemented( + kleidicv_thread_warp_perspective_u8, 2, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, border_value); + check_warp_perspective_not_implemented( + kleidicv_thread_warp_perspective_u8, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_CONSTANT, border_value); } -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE TEST_P(Thread, SobelHorizontal1Channel) { check_unary_op(kleidicv_sobel_3x3_horizontal_s16_u8, diff --git a/test/api/test_warp_perspective.cpp b/test/api/test_warp_perspective.cpp index 60c457f8f..b7b81ba38 100644 --- a/test/api/test_warp_perspective.cpp +++ b/test/api/test_warp_perspective.cpp @@ -4,13 +4,15 @@ #include +#include +#include + #include "framework/array.h" #include "framework/generator.h" #include "framework/utils.h" #include "kleidicv/ctypes.h" #include "kleidicv/kleidicv.h" - -#if KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE +#include "kleidicv/transform/warp_perspective.h" // clang-format off static const float transform_identity[] = { @@ -32,19 +34,34 @@ static const float transform_small[] = { }; // clang-format on +// Perspective calculation in float greatly amplifies any small errors coming +// from precision innaccuracies, let it be Fused-Multiply-Add or just the +// limitation of the single precision float. +// Taking the fractional part of a value in the thousands' range decreases +// the precision by 3 decimal digits, and the single precision float only has +// 7 decimal digits. Doing a linear interpolation between 0 and 255 would +// decrease it even more, but ensuring that neighbouring pixels have +// neighbouring values decreases this effect by more than 2 decimal digits so it +// can be expected that the error won't be bigger than 1. template -static void random_initializer(test::Array2D &source) { - test::PseudoRandomNumberGenerator generator; - source.fill(generator); +static void sequential_initializer(test::Array2D &source) { + for (int64_t y = 0; y < static_cast(source.height()); ++y) { + for (int64_t x = 0; x < static_cast(source.width()); ++x) { + const int64_t kMaxVal = std::numeric_limits::max(); + *source.at(y, x) = abs((x + y) % (2 * kMaxVal + 1) - kMaxVal); + } + } } template -class WarpPerspective : public testing::Test { +class WarpPerspectiveNearest : public testing::Test { public: - static void test( - size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, - const float transform[9], size_t channels, size_t padding, - void (*initializer)(test::Array2D &) = random_initializer) { + static void test_stripe(size_t src_w, size_t src_h, size_t dst_w, + size_t dst_h, size_t y_begin, size_t y_end, + const float transform[9], size_t channels, + size_t padding, + void (*initializer)(test::Array2D &) = + sequential_initializer) { size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -54,24 +71,61 @@ class WarpPerspective : public testing::Test { channels}; initializer(source); - actual.fill(42); - - calculate_expected(source, transform, expected); + for (size_t row = y_begin; row < y_end; ++row) { + for (size_t column = 0; column < actual.width(); ++column) { + *actual.at(row, column) = 42; + } + } - ASSERT_EQ(KLEIDICV_OK, kleidicv_warp_perspective_u8( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), transform, - channels, KLEIDICV_INTERPOLATION_NEAREST, - KLEIDICV_BORDER_TYPE_REPLICATE, {})); + calculate_expected(source, y_begin, y_end, transform, expected); + + ASSERT_EQ( + KLEIDICV_OK, + kleidicv_warp_perspective_stripe_u8( + source.data(), source.stride(), source.width(), source.height(), + actual.data(), actual.stride(), actual.width(), actual.height(), + y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, {})); + + ScalarType threshold = 1; + for (size_t row = y_begin; row < y_end; ++row) { + for (size_t column = 0; column < expected.width(); ++column) { + const ScalarType *lhs = expected.at(row, column); + const ScalarType *rhs = actual.at(row, column); + if (!lhs || !rhs) { + GTEST_FAIL() << "Nullptr at (row=" << row << ", column=" << column + << "): " << reinterpret_cast(lhs) << " , " + << reinterpret_cast(lhs) << std::endl; + } + ScalarType error; + if constexpr (std::is_integral::value) { + error = static_cast( + abs(static_cast(lhs[0]) - static_cast(rhs[0]))); + } else { + error = std::abs(lhs[0] - rhs[0]); + } + if (error > threshold) { + GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << column + << "): " << +(expected).at(row, column)[0] << " vs " + << +(actual).at(row, column)[0] << "." << std::endl; + } + } + } + } - EXPECT_EQ_ARRAY2D(actual, expected); + static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, + const float transform[9], size_t channels, size_t padding, + void (*initializer)(test::Array2D &) = + sequential_initializer) { + test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels, + padding, initializer); } static void test_special_source( size_t src_w, size_t src_h, size_t src_stride, size_t dst_w, size_t dst_h, const float transform[9], size_t channels, - void (*initializer)(test::Array2D &) = random_initializer) { + void (*initializer)(test::Array2D &) = + sequential_initializer) { size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -88,7 +142,7 @@ class WarpPerspective : public testing::Test { actual.fill(42); - calculate_expected(source, transform, expected); + calculate_expected(source, 0, expected.height(), transform, expected); ASSERT_EQ(KLEIDICV_OK, kleidicv_warp_perspective_u8( @@ -97,14 +151,14 @@ class WarpPerspective : public testing::Test { channels, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, {})); - EXPECT_EQ_ARRAY2D(actual, expected); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } private: - static void calculate_expected(test::Array2D &src, - const float transform[9], + static void calculate_expected(test::Array2D &src, size_t y_begin, + size_t y_end, const float transform[9], test::Array2D &expected) { - for (size_t y = 0; y < expected.height(); ++y) { + for (size_t y = y_begin; y < y_end; ++y) { for (size_t x = 0; x < expected.width() / src.channels(); ++x) { float dx = static_cast(x), dy = static_cast(y); @@ -128,9 +182,9 @@ class WarpPerspective : public testing::Test { }; using WarpPerspectiveElementTypes = ::testing::Types; -TYPED_TEST_SUITE(WarpPerspective, WarpPerspectiveElementTypes); +TYPED_TEST_SUITE(WarpPerspectiveNearest, WarpPerspectiveElementTypes); -TYPED_TEST(WarpPerspective, IdentityNoPadding) { +TYPED_TEST(WarpPerspectiveNearest, IdentityNoPadding) { size_t src_w = 3 * test::Options::vector_lanes() - 1; size_t src_h = 4; size_t dst_w = src_w; @@ -138,7 +192,7 @@ TYPED_TEST(WarpPerspective, IdentityNoPadding) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0); } -TYPED_TEST(WarpPerspective, TransposeNoPadding) { +TYPED_TEST(WarpPerspectiveNearest, TransposeNoPadding) { size_t src_w = 3 * test::Options::vector_lanes() - 1; size_t src_h = 4; size_t dst_w = src_w; @@ -146,7 +200,7 @@ TYPED_TEST(WarpPerspective, TransposeNoPadding) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0); } -TYPED_TEST(WarpPerspective, SmallPadding) { +TYPED_TEST(WarpPerspectiveNearest, SmallPadding) { size_t src_w = 3 * test::Options::vector_lanes() - 1; size_t src_h = 4; size_t dst_w = src_w; @@ -154,7 +208,7 @@ TYPED_TEST(WarpPerspective, SmallPadding) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13); } -TYPED_TEST(WarpPerspective, Upscale) { +TYPED_TEST(WarpPerspectiveNearest, Upscale) { // clang-format off const float transform_upscale[] = { 0.2, 0.05, 0.1, @@ -170,7 +224,7 @@ TYPED_TEST(WarpPerspective, Upscale) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); } -TYPED_TEST(WarpPerspective, RandomTransform) { +TYPED_TEST(WarpPerspectiveNearest, RandomTransform) { float transform[9]; test::PseudoRandomNumberGenerator generator; for (size_t i = 0; i < 9; ++i) { @@ -184,7 +238,7 @@ TYPED_TEST(WarpPerspective, RandomTransform) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19); } -TYPED_TEST(WarpPerspective, DivisionByZero) { +TYPED_TEST(WarpPerspectiveNearest, DivisionByZero) { // clang-format off const float transform_div_by_zero[] = { 1, 0, 0, @@ -200,38 +254,100 @@ TYPED_TEST(WarpPerspective, DivisionByZero) { TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3); } -static const size_t big_width = 1ULL << 17, big_height = 1ULL << 17; -static const size_t part_width = 16, part_height = 16; +static const size_t kBigWidth = 1ULL << 17, kBigHeight = 1ULL << 17; +static const size_t kPartWidth = 21, kPartHeight = 21; template static void part_initializer(test::Array2D &source) { ScalarType counter = 0; - for (size_t y = big_height; y < big_height + part_height; ++y) { - for (size_t x = big_width; x < big_width + part_width; ++x) { + for (size_t y = kBigHeight; y < kBigHeight + kPartHeight; ++y) { + for (size_t x = kBigWidth; x < kBigWidth + kPartWidth; ++x) { *source.at(y, x) = ++counter; } } } -TYPED_TEST(WarpPerspective, BigSourceImage) { +TYPED_TEST(WarpPerspectiveNearest, BigSourceImage) { // clang-format off const float transform_cut[] = { - 1, 0, 1ULL<<17, - 0, 1, 1ULL<<17, + 1, 0, kBigWidth, + 0, 1, kBigHeight, 0, 0, 1 }; // clang-format on // Let stride * height be bigger than 1 << 32 - size_t dst_w = part_width; - size_t dst_h = part_height; - size_t src_w = big_width + part_width; - size_t src_h = big_height + part_height; + size_t dst_w = kPartWidth; + size_t dst_h = kPartHeight; + size_t src_w = kBigWidth + kPartWidth; + size_t src_h = kBigHeight + kPartHeight; TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0, part_initializer); } +TYPED_TEST(WarpPerspectiveNearest, BigWidthDestination) { + // clang-format off + const float transform_upscale[] = { + 0.0011, 0, 0.3, + 0, 1.02, -0.1, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = kBigWidth; + size_t dst_h = 1; + size_t src_w = dst_w / 1000; + size_t src_h = dst_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); +} + +static const size_t kHugeHeight = (1ULL << 24) - 1; +TYPED_TEST(WarpPerspectiveNearest, BigHeightDestination) { + // clang-format off + const float transform_upscale[] = { + 1.03, 0, 0.1, + 0, 0.00011, -0.3, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = 17; + size_t dst_h = kHugeHeight; + size_t src_w = dst_w; + size_t src_h = dst_h / 10000; + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform_upscale, 1, 0); +} + +template +static void huge_height_part_initializer(test::Array2D &source) { + ScalarType counter = 0; + for (size_t y = kHugeHeight - kPartHeight; y < kHugeHeight; ++y) { + for (size_t x = 0; x < 17; ++x) { + *source.at(y, x) = ++counter; + } + } +} + +TYPED_TEST(WarpPerspectiveNearest, HugeHeightSourceAndDestination) { + // clang-format off + const float transform[] = { + 1.03, 0, 0.1, + 0, 0.9993, -0.3, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = 17; + size_t dst_h = (1ULL << 24) - 1; + size_t src_w = dst_w; + size_t src_h = dst_h; + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - 16, dst_h, + transform, 1, 0, + huge_height_part_initializer); +} + static const size_t oneline_big_width = 1ULL << 23; static const size_t oneline_part_width = 16, oneline_part_offset = 1ULL << 17, oneline_dst_height = 16; @@ -245,7 +361,7 @@ static void oneline_part_initializer(test::Array2D &source) { } } -TYPED_TEST(WarpPerspective, OneLineBigSourceImage) { +TYPED_TEST(WarpPerspectiveNearest, OneLineBigSourceImage) { // clang-format off const float tr[] = { 1, 0, oneline_part_offset, @@ -266,7 +382,7 @@ TYPED_TEST(WarpPerspective, OneLineBigSourceImage) { tr, 1, oneline_part_initializer); } -TYPED_TEST(WarpPerspective, OneLineSmallSourceImage) { +TYPED_TEST(WarpPerspectiveNearest, OneLineSmallSourceImage) { // clang-format off const float tr[] = { -1, 0, 1, @@ -283,7 +399,7 @@ TYPED_TEST(WarpPerspective, OneLineSmallSourceImage) { TestFixture::test_special_source(src_w, src_h, src_h, dst_w, dst_h, tr, 1); } -TYPED_TEST(WarpPerspective, NullPointer) { +TYPED_TEST(WarpPerspectiveNearest, NullPointer) { const TypeParam src[4] = {}; TypeParam dst[8]; test::test_null_args(kleidicv_warp_perspective_u8, src, 2, 2, 2, dst, 8, 8, 1, @@ -291,7 +407,7 @@ TYPED_TEST(WarpPerspective, NullPointer) { KLEIDICV_BORDER_TYPE_REPLICATE, nullptr); } -TYPED_TEST(WarpPerspective, ZeroImageSize) { +TYPED_TEST(WarpPerspectiveNearest, ZeroImageSize) { const TypeParam src[1] = {}; TypeParam dst[1]; @@ -307,7 +423,7 @@ TYPED_TEST(WarpPerspective, ZeroImageSize) { nullptr)); } -TYPED_TEST(WarpPerspective, InvalidImageSize) { +TYPED_TEST(WarpPerspectiveNearest, InvalidImageSize) { const TypeParam src[1] = {}; TypeParam dst[8]; @@ -338,7 +454,7 @@ TYPED_TEST(WarpPerspective, InvalidImageSize) { nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedTwoChannels) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedTwoChannels) { const TypeParam src[1] = {}; TypeParam dst[8]; @@ -349,7 +465,7 @@ TYPED_TEST(WarpPerspective, UnsupportedTwoChannels) { nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedBorderTypeConst) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBorderTypeConst) { const TypeParam src[1] = {}; TypeParam dst[8]; @@ -360,7 +476,7 @@ TYPED_TEST(WarpPerspective, UnsupportedBorderTypeConst) { KLEIDICV_BORDER_TYPE_CONSTANT, src)); } -TYPED_TEST(WarpPerspective, UnsupportedTooSmallImage) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedTooSmallImage) { const TypeParam src[1] = {}; TypeParam dst[8]; @@ -371,47 +487,390 @@ TYPED_TEST(WarpPerspective, UnsupportedTooSmallImage) { nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedInterpolation) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigStride) { const TypeParam src[1] = {}; TypeParam dst[8]; - EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + EXPECT_EQ(KLEIDICV_ERROR_RANGE, kleidicv_warp_perspective_u8( - src, 1, 1, 1, dst, 8, 8, 1, transform_identity, 1, - KLEIDICV_INTERPOLATION_LINEAR, KLEIDICV_BORDER_TYPE_REPLICATE, + src, 1UL << 32, 1, 1, dst, 8, 8, 1, transform_identity, 1, + KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedBigStride) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigSourceHeight) { const TypeParam src[1] = {}; TypeParam dst[8]; EXPECT_EQ(KLEIDICV_ERROR_RANGE, kleidicv_warp_perspective_u8( - src, 1UL << 32, 1, 1, dst, 8, 8, 1, transform_identity, 1, + src, 1, 1, 1UL << 24, dst, 8, 8, 1, transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedBigHeight) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigSourceWidth) { const TypeParam src[1] = {}; TypeParam dst[8]; EXPECT_EQ(KLEIDICV_ERROR_RANGE, kleidicv_warp_perspective_u8( - src, 1, 1, 1UL << 24, dst, 8, 8, 1, transform_identity, 1, + src, 1, 1UL << 24, 1, dst, 8, 8, 1, transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -TYPED_TEST(WarpPerspective, UnsupportedBigWidth) { +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigDestinationHeight) { const TypeParam src[1] = {}; TypeParam dst[8]; EXPECT_EQ(KLEIDICV_ERROR_RANGE, kleidicv_warp_perspective_u8( - src, 1, 1UL << 24, 1, dst, 8, 8, 1, transform_identity, 1, + src, 1, 1, 1, dst, 8, 8, 1UL << 24, transform_identity, 1, KLEIDICV_INTERPOLATION_NEAREST, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); } -#endif // KLEIDICV_EXPERIMENTAL_FEATURE_WARP_PERSPECTIVE + +TYPED_TEST(WarpPerspectiveNearest, UnsupportedBigDestinationWidth) { + const TypeParam src[1] = {}; + TypeParam dst[8]; + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + kleidicv_warp_perspective_u8( + src, 1, 1, 1, dst, 1UL << 24, 1UL << 24, 1, transform_identity, + 1, KLEIDICV_INTERPOLATION_NEAREST, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); +} + +///////////////////////////////////////////////////////////// +// WarpPerspective with Linear Interpolation +///////////////////////////////////////////////////////////// + +template +class WarpPerspectiveLinear : public testing::Test { + public: + static void test_stripe(size_t src_w, size_t src_h, size_t dst_w, + size_t dst_h, size_t y_begin, size_t y_end, + const float transform[9], size_t channels, + size_t padding, + void (*initializer)(test::Array2D &) = + sequential_initializer) { + size_t src_total_width = channels * src_w; + size_t dst_total_width = channels * dst_w; + + test::Array2D source{src_total_width, src_h, padding, channels}; + test::Array2D actual{dst_total_width, dst_h, padding, channels}; + test::Array2D expected{dst_total_width, dst_h, padding, + channels}; + + initializer(source); + for (size_t row = y_begin; row < y_end; ++row) { + for (size_t column = 0; column < actual.width(); ++column) { + *actual.at(row, column) = 42; + } + } + + calculate_expected(source, y_begin, y_end, transform, expected); + + ASSERT_EQ( + KLEIDICV_OK, + kleidicv_warp_perspective_stripe_u8( + source.data(), source.stride(), source.width(), source.height(), + actual.data(), actual.stride(), actual.width(), actual.height(), + y_begin, y_end, transform, channels, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, {})); + + ScalarType threshold = 1; + for (size_t row = y_begin; row < y_end; ++row) { + for (size_t column = 0; column < expected.width(); ++column) { + const ScalarType *lhs = expected.at(row, column); + const ScalarType *rhs = actual.at(row, column); + if (!lhs || !rhs) { + GTEST_FAIL() << "Nullptr at (row=" << row << ", column=" << column + << "): " << reinterpret_cast(lhs) << " , " + << reinterpret_cast(lhs) << std::endl; + } + ScalarType error; + if constexpr (std::is_integral::value) { + error = static_cast( + abs(static_cast(lhs[0]) - static_cast(rhs[0]))); + } else { + error = std::abs(lhs[0] - rhs[0]); + } + if (error > threshold) { + GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << column + << "): " << +(expected).at(row, column)[0] << " vs " + << +(actual).at(row, column)[0] << "." << std::endl; + } + } + } + } + + static void test(size_t src_w, size_t src_h, size_t dst_w, size_t dst_h, + const float transform[9], size_t channels, size_t padding, + void (*initializer)(test::Array2D &) = + sequential_initializer) { + test_stripe(src_w, src_h, dst_w, dst_h, 0, dst_h, transform, channels, + padding, initializer); + } + + private: + static void calculate_expected(test::Array2D &src, size_t y_begin, + size_t y_end, const float transform[9], + test::Array2D &expected) { + for (size_t y = y_begin; y < y_end; ++y) { + for (size_t x = 0; x < expected.width() / src.channels(); ++x) { + double dx = static_cast(x), dy = static_cast(y); + double tw = transform[6] * dx + transform[7] * dy + transform[8]; + double inv_w = 1.F / tw; + double tx = transform[0] * dx + transform[1] * dy + transform[2]; + double ty = transform[3] * dx + transform[4] * dy + transform[5]; + double fx = inv_w * tx; + double fy = inv_w * ty; + uint32_t ix0 = static_cast( + std::max(0.0, std::min(fx, src.width() - 1.0))); + uint32_t iy0 = static_cast( + std::max(0.0, std::min(fy, src.height() - 1.0))); + uint32_t ix1 = (ix0 < src.width() - 1 && fx >= 0) ? ix0 + 1 : ix0; + uint32_t iy1 = (iy0 < src.height() - 1 && fy >= 0) ? iy0 + 1 : iy0; + double xfrac = + (ix0 < src.width() - 1 && fx >= 0) ? fx - floor(fx) : 0.0; + double yfrac = + (iy0 < src.height() - 1 && fy >= 0) ? fy - floor(fy) : 0.0; + for (size_t ch = 0; ch < src.channels(); ++ch) { + double a = *src.at(iy0, ix0 * src.channels() + ch); + double b = *src.at(iy0, ix1 * src.channels() + ch); + double c = *src.at(iy1, ix0 * src.channels() + ch); + double d = *src.at(iy1, ix1 * src.channels() + ch); + double line1 = (b - a) * xfrac + a; + double line2 = (d - c) * xfrac + c; + double double_result = (line2 - line1) * yfrac + line1; + *expected.at(y, x * src.channels() + ch) = + static_cast(lround(double_result)); + } + } + } + } +}; + +TYPED_TEST_SUITE(WarpPerspectiveLinear, WarpPerspectiveElementTypes); + +TYPED_TEST(WarpPerspectiveLinear, SimpleResize) { + const size_t src_w = 16; + const size_t src_h = 4; + const size_t dst_w = 17; + const size_t dst_h = 5; + const size_t padding = 1; + + test::Array2D source(src_w, src_h, padding, 1); + source.fill(0); + test::Array2D actual(dst_w, dst_h, padding, 1); + actual.fill(0); + test::Array2D expected(dst_w, dst_h, padding, 1); + expected.fill(0); + + // clang-format off + source.set(0, 0, { 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100 }); + source.set(1, 0, { 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 100}); + source.set(2, 0, { 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100}); + + // first line: y = -0.3, so only the first line counts. x = [0, 0.9, 1.8, 2.7, 3.6, ...] + expected.set(0, 0, { 0, 90, 20, 70, 40, 50, 60, 30, 80, 10, 100, 10, 80, 30, 60, 50, 40 }); + // y = 1.2, x = [-0.1, 0.8, 1.7, 2.6, 3.5, ...] + expected.set(1, 0, { 20, 84, 44, 68, 60, 52, 76, 36, 92, 20, 92, 36, 76, 52, 60, 68, 44 }); + // y = 2.7, x = [-0.2, 0.7, 1.6, 2.5, ...] + expected.set(2, 0, { 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30 }); + // y = 4.2 and 5.7: all zeroes, x does not count + + const float little_scale[] = { + 0.9, -0.1, 0, + 0, 1.5, -0.3, + 0, 0, 1 + }; + // clang-format on + + EXPECT_EQ(KLEIDICV_OK, + kleidicv_warp_perspective_u8( + source.data(), source.stride(), source.width(), source.height(), + actual.data(), actual.stride(), actual.width(), actual.height(), + little_scale, 1, KLEIDICV_INTERPOLATION_LINEAR, + KLEIDICV_BORDER_TYPE_REPLICATE, nullptr)); + + EXPECT_EQ_ARRAY2D(actual, expected); +} + +TYPED_TEST(WarpPerspectiveLinear, IdentityNoPadding) { + size_t src_w = 3 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w; + size_t dst_h = src_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_identity, 1, 0); +} + +TYPED_TEST(WarpPerspectiveLinear, TransposeNoPadding) { + size_t src_w = 3 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w; + size_t dst_h = src_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_transpose, 1, 0); +} + +TYPED_TEST(WarpPerspectiveLinear, SmallPadding) { + size_t src_w = 5 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w; + size_t dst_h = src_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_small, 1, 13); +} + +TYPED_TEST(WarpPerspectiveLinear, Upscale) { + // clang-format off + const float transform_upscale[] = { + 0.2, 0.05, 0.1, + 0.02, 0.2, -0.1, + 0.0001, 0.0001, 1.1 + }; + // clang-format on + + size_t src_w = 3 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w * 3; + size_t dst_h = src_h * 2; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); +} + +TYPED_TEST(WarpPerspectiveLinear, RandomTransform) { + float transform[9]; + // Not entirely random, as very small and very big floats (in absolute value) + // cause too big errors and they are far from being valid use cases anyway + test::PseudoRandomNumberGeneratorIntRange exponentGenerator(-10, 10); + test::PseudoRandomNumberGeneratorFloatRange mantissaGenerator(-1.0, + 1.0); + for (size_t cc = 0; cc < 100; ++cc) { + for (size_t i = 0; i < 9; ++i) { + transform[i] = + mantissaGenerator.next().value_or(1.0) * + static_cast( + exp(static_cast(exponentGenerator.next().value_or(1.0)))); + } + + size_t src_w = 3 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w; + size_t dst_h = src_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform, 1, 19); + } +} + +TYPED_TEST(WarpPerspectiveLinear, DivisionByZero) { + // clang-format off + const float transform_div_by_zero[] = { + 1, 0, 0, + 0, 1, 0, + 1, 1, -3 + }; + // clang-format on + + size_t src_w = 3 * test::Options::vector_lanes() - 1; + size_t src_h = 4; + size_t dst_w = src_w; + size_t dst_h = src_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_div_by_zero, 1, 3); +} + +template +static void crisscross_initializer(test::Array2D &source) { + for (int64_t y = 0; y < static_cast(source.height()); ++y) { + for (int64_t x = 0; x < static_cast(source.width()); ++x) { + *source.at(y, x) = + ((x % 10 == 0 && x > 0) || (y % 10 == 0 && y > 0)) ? 0 : 255; + } + } +} + +TYPED_TEST(WarpPerspectiveLinear, RoughSource) { + // clang-format off + const float transformation[] = { + -0.14721069682646492627, 0.0039219946625178928046, -0.23197875743643098234, + -9.6823083601201584969, 0.27678993497443726834, -24.722070405199925602, + 0.043781423781792741523, 0.0021065152608927420821, -0.5757716849053092778 + }; + // clang-format on + + size_t src_w = 13; + size_t src_h = 634; + size_t dst_w = 17; + size_t dst_h = 852; + TestFixture::test(src_w, src_h, dst_w, dst_h, transformation, 1, 13, + crisscross_initializer); +} + +TYPED_TEST(WarpPerspectiveLinear, BigSourceImage) { + // clang-format off + const float transform_cut[] = { + 1, 0, 1ULL<<17, + 0, 1, 1ULL<<17, + 0, 0, 1 + }; + // clang-format on + + // Let stride * height be bigger than 1 << 32 + size_t dst_w = kPartWidth; + size_t dst_h = kPartHeight; + size_t src_w = kBigWidth + kPartWidth + 3; + size_t src_h = kBigHeight + kPartHeight; + + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_cut, 1, 0, + part_initializer); +} + +TYPED_TEST(WarpPerspectiveLinear, BigWidthDestination) { + // clang-format off + const float transform_upscale[] = { + 0.0011, 0, 0.3, + 0, 1.02, -0.1, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = kBigWidth; + size_t dst_h = 1; + size_t src_w = dst_w / 1000; + size_t src_h = dst_h; + TestFixture::test(src_w, src_h, dst_w, dst_h, transform_upscale, 1, 3); +} + +TYPED_TEST(WarpPerspectiveLinear, BigHeightDestination) { + // clang-format off + const float transform_upscale[] = { + 1.03, 0, 0.1, + 0, 0.00011, -0.3, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = 17; + size_t dst_h = kHugeHeight; + size_t src_w = dst_w; + size_t src_h = dst_h / 10000; + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform_upscale, 1, 0); +} + +TYPED_TEST(WarpPerspectiveLinear, HugeHeightSourceAndDestination) { + // clang-format off + const float transform[] = { + 1.03, 0, 0.1, + 0, 0.9993, -0.3, + 0, 0, 1 + }; + // clang-format on + + size_t dst_w = 17; + size_t dst_h = (1ULL << 24) - 1; + size_t src_w = dst_w; + size_t src_h = dst_h; + TestFixture::test_stripe(src_w, src_h, dst_w, dst_h, dst_h - kPartHeight, + dst_h, transform, 1, 0, + huge_height_part_initializer); +} diff --git a/test/framework/array.h b/test/framework/array.h index 60bff94ac..cabdd9f4a 100644 --- a/test/framework/array.h +++ b/test/framework/array.h @@ -197,20 +197,29 @@ class Array2D : public TwoDimensional { } } - // Compares two instances for equality considering only element bytes. - // Returns the location of the first mismatch, if any. + // Compares two instances for (near) equality considering its content + // (elements). Returns the location of the first mismatch, if any. std::optional> compare_to( - const Array2D &other) const { + const Array2D &other, ElementType threshold = 0) const { for (size_t row = 0; row < height(); ++row) { for (size_t column = 0; column < width(); ++column) { const ElementType *lhs = at(row, column); const ElementType *rhs = other.at(row, column); - if (!lhs || !rhs || (lhs[0] != rhs[0])) { + if (!lhs || !rhs) { + return std::make_tuple(row, column); + } + ElementType error; + if constexpr (std::is_integral::value) { + error = static_cast( + abs(static_cast(lhs[0]) - static_cast(rhs[0]))); + } else { + error = std::abs(lhs[0] - rhs[0]); + } + if (error > threshold) { return std::make_tuple(row, column); } } } - return std::nullopt; } @@ -235,8 +244,8 @@ class Array2D : public TwoDimensional { // Returns true if this object holds actual memory, otherwise false. bool valid() const { return data() != nullptr; } - // Returns a pointer to a data element at a given row and column position, or - // nullptr if the requested position is invalid. + // Returns a pointer to a data element at a given row and column position, + // or nullptr if the requested position is invalid. ElementType *at(size_t row, size_t column) override { return const_cast( const_cast *>(this)->at(row, column)); @@ -255,8 +264,8 @@ class Array2D : public TwoDimensional { } private: - // Returns the number of elements between the end of one row and the start of - // the next row. + // Returns the number of elements between the end of one row and the start + // of the next row. size_t padding() { return stride() / sizeof(ElementType) - width(); } // Returns the offset to the first padding byte within a row. @@ -336,9 +345,9 @@ class Array2D : public TwoDimensional { buffer_ = std::unique_ptr(new ElementType[allocation_count]); // Weaken alignment to flush out potential alignment issues. - // buffer_.get() will contain a pointer that is at least 16-byte aligned. - // By adding a small offset to that we get a pointer that is only aligned - // to sizeof(ElementType). + // buffer_.get() will contain a pointer that is at least 16-byte + // aligned. By adding a small offset to that we get a pointer that is + // only aligned to sizeof(ElementType). data_ = buffer_.get() + 1; } catch (...) { reset(); @@ -352,8 +361,8 @@ class Array2D : public TwoDimensional { // Smart pointer to the managed memory. std::unique_ptr buffer_; - // Pointer to the start of the data. This is offset from the start of buffer_ - // to flush out potential alignment issues. + // Pointer to the start of the data. This is offset from the start of + // buffer_ to flush out potential alignment issues. ElementType *data_{nullptr}; // Width a row in the array. size_t width_{0}; @@ -365,9 +374,9 @@ class Array2D : public TwoDimensional { size_t stride_{0}; }; // end of class Array2D -// Compares two Array2D objects for equality. +// Compares two Array2D objects for (near-)equality. // Unary + is used to ensure values are printed as integers, not chars -#define EXPECT_EQ_ARRAY2D(lhs, rhs) \ +#define EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(threshold, lhs, rhs) \ do { \ ASSERT_EQ((lhs).width(), (rhs).width()) \ << "Mismatch in width." << std::endl; \ @@ -375,7 +384,7 @@ class Array2D : public TwoDimensional { << "Mismatch in height." << std::endl; \ ASSERT_EQ((lhs).channels(), (rhs).channels()) \ << "Mismatch in channels." << std::endl; \ - auto mismatch = (lhs).compare_to((rhs)); \ + auto mismatch = (lhs).compare_to((rhs), (threshold)); \ if (mismatch) { \ auto [row, col] = *mismatch; \ GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << col \ @@ -385,6 +394,10 @@ class Array2D : public TwoDimensional { } \ } while (0 != 0) +// Compares two Array2D objects for equality. +#define EXPECT_EQ_ARRAY2D(lhs, rhs) \ + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(0, (lhs), (rhs)) + // Compares two Array2D objects for inequality. #define EXPECT_NE_ARRAY2D(lhs, rhs) \ do { \ -- GitLab