From 14095cf8e1b60faf226c75f662297f9ecb7c8166 Mon Sep 17 00:00:00 2001 From: Mark Horvath Date: Thu, 7 Nov 2024 09:22:54 +0000 Subject: [PATCH 1/2] Add NEON version of scharr_interleaved_s16_u8 --- adapters/opencv/kleidicv_hal.cpp | 13 + adapters/opencv/kleidicv_hal.h | 24 ++ adapters/opencv/opencv-4.10.patch | 117 ++++++++- conformity/opencv/CMakeLists.txt | 4 + conformity/opencv/common.h | 1 + conformity/opencv/test_scharr.cpp | 48 ++++ conformity/opencv/tests.cpp | 1 + conformity/opencv/tests.h | 1 + doc/opencv.md | 3 + kleidicv/include/kleidicv/filters/scharr.h | 45 ++++ kleidicv/include/kleidicv/kleidicv.h | 55 +++++ kleidicv/src/filters/scharr_api.cpp | 30 +++ kleidicv/src/filters/scharr_neon.cpp | 228 ++++++++++++++++++ .../include/kleidicv_thread/kleidicv_thread.h | 9 + kleidicv_thread/src/kleidicv_thread.cpp | 20 ++ scripts/benchmark/benchmarks.txt | 2 + scripts/ci-opencv.sh | 12 +- scripts/run_opencv_conformity_checks.sh | 2 +- test/api/test_scharr.cpp | 191 +++++++++++++++ test/api/test_thread.cpp | 43 ++++ 20 files changed, 839 insertions(+), 10 deletions(-) create mode 100644 conformity/opencv/test_scharr.cpp create mode 100644 kleidicv/include/kleidicv/filters/scharr.h create mode 100644 kleidicv/src/filters/scharr_api.cpp create mode 100644 kleidicv/src/filters/scharr_neon.cpp create mode 100644 test/api/test_scharr.cpp diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index da456cc2d..99b2b10af 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1341,4 +1341,17 @@ int pyrdown(const uchar *src_data, size_t src_step, int src_width, return convert_error(blur_err ? blur_err : release_err); } +int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data, + size_t dst_step, int width, int height, int cn) { + // OpenCV provides the source pointer in a way that out-of-bounds reads are + // possible to handle borders. On the other hand, KleidiCV expects that the + // source pointer points to the top left pixel to be read by the algorithm. + const uint8_t *src = + reinterpret_cast(src_data - src_step) - cn; + + auto mt = get_multithreading(); + return convert_error(kleidicv_thread_scharr_interleaved_s16_u8( + src, src_step, width + 2, height + 2, cn, dst_data, dst_step, mt)); +} + } // namespace kleidicv::hal diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 70f92d20f..2d34f27f9 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -152,6 +152,9 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, const int16_t *mapxy, size_t mapxy_step, const uint16_t *mapfrac, size_t mapfrac_step, int border_type, const double border_value[4]); + +int scharr_deriv(const uchar *src_data, size_t src_step, int16_t *dst_data, + size_t dst_step, int width, int height, int cn); } // namespace hal } // namespace kleidicv @@ -572,6 +575,27 @@ static inline int kleidicv_in_range_f32_with_fallback( #endif // OPENCV_CORE_HAL_REPLACEMENT_HPP +#ifdef OPENCV_VIDEO_HAL_REPLACEMENT_HPP + +// ScharrDeriv +// This condition can be removed if this HAL macro is defined in all supported +// versions +#ifdef cv_hal_ScharrDeriv +static inline int kleidicv_ScharrDeriv_with_fallback(const uchar *src_data, + size_t src_step, + int16_t *dst_data, + size_t dst_step, int width, + int height, int cn) { + return KLEIDICV_HAL_FALLBACK_FORWARD(scharr_deriv, cv_hal_ScharrDeriv, + src_data, src_step, dst_data, dst_step, + width, height, cn); +} +#undef cv_hal_ScharrDeriv +#define cv_hal_ScharrDeriv kleidicv_ScharrDeriv_with_fallback +#endif // cv_hal_ScharrDeriv + +#endif // OPENCV_VIDEO_HAL_REPLACEMENT_HPP + // Remove no longer needed macro definitions. #undef KLEIDICV_HAL_FALLBACK_FORWARD diff --git a/adapters/opencv/opencv-4.10.patch b/adapters/opencv/opencv-4.10.patch index 23d1dd320..06fd4ef89 100644 --- a/adapters/opencv/opencv-4.10.patch +++ b/adapters/opencv/opencv-4.10.patch @@ -19,7 +19,7 @@ index 2b4035285f..729cd1dd43 100644 @@ -281,6 +281,11 @@ void Mat::convertTo(OutputArray dst, int type_, double alpha, double beta) const dst.create(dims, size, dtype); Mat dstMat = dst.getMat(); - + + if( dims <= 2 ) { + int width_in_elements = src.cols * cn; + CALL_HAL(convertTo, cv_hal_convertTo, src.data, src.step, src.depth(), dstMat.data, dstMat.step, dstMat.depth(), width_in_elements, src.rows, alpha, beta); @@ -35,7 +35,7 @@ index f78608dbad..299b5e54bd 100644 @@ -953,6 +953,41 @@ inline int hal_ni_transpose2d(const uchar* src_data, size_t src_step, uchar* dst #define cv_hal_transpose2d hal_ni_transpose2d //! @endcond - + +/** + @brief convertTo + @param src_data,src_step,src_depth Source image @@ -72,8 +72,8 @@ index f78608dbad..299b5e54bd 100644 +//! @endcond + //! @} - - + + diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp index 8c6d8ad9a9..47eb6fdb66 100644 --- a/modules/core/src/minmax.cpp @@ -91,13 +91,13 @@ index 8c6d8ad9a9..47eb6fdb66 100644 } else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp -index 773fed9b48..145d653f5d 100644 +index 773fed9b48..b74ff70f99 100644 --- a/modules/imgproc/src/hal_replacement.hpp +++ b/modules/imgproc/src/hal_replacement.hpp @@ -328,6 +328,60 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step, #define cv_hal_remap32f hal_ni_remap32f //! @endcond - + +/** + @brief hal_remap with a short integer map + @param src_type source and destination image type @@ -172,7 +172,7 @@ index d7c9c64c3c..348208b72d 100644 + CALL_HAL(remap16s16u, cv_hal_remap16s16u, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, + map1.ptr(), map1.step, map2.ptr(), map2.step, borderType, borderValue.val); } - + interpolation &= ~WARP_RELATIVE_MAP; diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp index d0f50a73bb..1c308887dc 100644 @@ -181,7 +181,7 @@ index d0f50a73bb..1c308887dc 100644 @@ -654,6 +654,25 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType) ); - + + { + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); @@ -204,3 +204,104 @@ index d0f50a73bb..1c308887dc 100644 if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix())) { std::vector fkx, fky; +diff --git a/modules/video/src/hal_replacement.hpp b/modules/video/src/hal_replacement.hpp +new file mode 100644 +index 0000000000..e413dd3894 +--- /dev/null ++++ b/modules/video/src/hal_replacement.hpp +@@ -0,0 +1,73 @@ ++// This file is part of OpenCV project. ++// It is subject to the license terms in the LICENSE file found in the top-level directory ++// of this distribution and at http://opencv.org/license.html. ++ ++#ifndef OPENCV_VIDEO_HAL_REPLACEMENT_HPP ++#define OPENCV_VIDEO_HAL_REPLACEMENT_HPP ++ ++#include "opencv2/core/hal/interface.h" ++ ++#if defined(__clang__) // clang or MSVC clang ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wunused-parameter" ++#elif defined(_MSC_VER) ++#pragma warning(push) ++#pragma warning(disable : 4100) ++#elif defined(__GNUC__) ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Wunused-parameter" ++#endif ++ ++/** ++@brief Computes Schaar derivatives with inteleaved layout xyxy... ++@param src_data source image data ++@param src_step source image step ++@param dst_data destination buffer data ++@param dst_step destination buffer step ++@param width image width ++@param height image height ++@param cn source image channels ++**/ ++inline int hal_ni_ScharrDeriv(const uchar* src_data, size_t src_step, ++ short* dst_data, size_t dst_step, ++ int width, int height, int cn) ++{ ++ return CV_HAL_ERROR_NOT_IMPLEMENTED; ++} ++ ++//! @cond IGNORED ++#define cv_hal_ScharrDeriv hal_ni_ScharrDeriv ++//! @endcond ++ ++//! @} ++ ++#if defined(__clang__) ++#pragma clang diagnostic pop ++#elif defined(_MSC_VER) ++#pragma warning(pop) ++#elif defined(__GNUC__) ++#pragma GCC diagnostic pop ++#endif ++ ++#include "custom_hal.hpp" ++ ++//! @cond IGNORED ++#define CALL_HAL_RET(name, fun, retval, ...) \ ++ int res = __CV_EXPAND(fun(__VA_ARGS__, &retval)); \ ++ if (res == CV_HAL_ERROR_OK) \ ++ return retval; \ ++ else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ ++ CV_Error_(cv::Error::StsInternal, \ ++ ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); ++ ++ ++#define CALL_HAL(name, fun, ...) \ ++ int res = __CV_EXPAND(fun(__VA_ARGS__)); \ ++ if (res == CV_HAL_ERROR_OK) \ ++ return; \ ++ else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ ++ CV_Error_(cv::Error::StsInternal, \ ++ ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); ++//! @endcond ++ ++#endif +diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp +index 6d51c0cf1a..0e6f6a324e 100644 +--- a/modules/video/src/lkpyramid.cpp ++++ b/modules/video/src/lkpyramid.cpp +@@ -50,6 +50,7 @@ + #endif + + #include "opencv2/core/openvx/ovx_defs.hpp" ++#include "hal_replacement.hpp" + + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +@@ -62,6 +63,9 @@ static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst) + int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth(); + CV_Assert(depth == CV_8U); + dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn*2)); ++ ++ CALL_HAL(ScharrDeriv, cv_hal_ScharrDeriv, src.data, src.step, (short*)dst.data, dst.step, cols, rows, cn); ++ + parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads()); + } + diff --git a/conformity/opencv/CMakeLists.txt b/conformity/opencv/CMakeLists.txt index 3daa88943..347c6f1bb 100644 --- a/conformity/opencv/CMakeLists.txt +++ b/conformity/opencv/CMakeLists.txt @@ -39,6 +39,7 @@ target_link_libraries( manager opencv_core opencv_imgproc + opencv_video ) target_include_directories( @@ -47,6 +48,7 @@ target_include_directories( ${CMAKE_BINARY_DIR} ${OpenCV_SOURCE_DIR}/modules/core/include ${OpenCV_SOURCE_DIR}/modules/imgproc/include + ${OpenCV_SOURCE_DIR}/modules/video/include ) target_compile_definitions( @@ -74,6 +76,7 @@ target_link_libraries( subordinate opencv_core opencv_imgproc + opencv_video ) target_include_directories( @@ -82,6 +85,7 @@ target_include_directories( ${CMAKE_BINARY_DIR} ${OpenCV_SOURCE_DIR}/modules/core/include ${OpenCV_SOURCE_DIR}/modules/imgproc/include + ${OpenCV_SOURCE_DIR}/modules/video/include ) target_compile_definitions( diff --git a/conformity/opencv/common.h b/conformity/opencv/common.h index 681f8a6e3..add629374 100644 --- a/conformity/opencv/common.h +++ b/conformity/opencv/common.h @@ -21,6 +21,7 @@ #include "opencv2/core.hpp" #include "opencv2/imgproc.hpp" +#include "opencv2/video.hpp" #define KLEIDICV_CONFORMITY_SHM_ID "/opencv_kleidicv_conformity_check_shm" #define KLEIDICV_CONFORMITY_SHM_SIZE (1024 * 1024) diff --git a/conformity/opencv/test_scharr.cpp b/conformity/opencv/test_scharr.cpp new file mode 100644 index 000000000..9f2630718 --- /dev/null +++ b/conformity/opencv/test_scharr.cpp @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "tests.h" + +cv::Mat exec_scharr_interleaved(cv::Mat& input) { + std::vector pyramid; + // `winSize` is not take into account when `maxLevel` is set to 0. + cv::buildOpticalFlowPyramid(input, pyramid, cv::Size(3, 3), 0, true); + return pyramid[1].clone(); +} + +#if MANAGER +bool test_scharr_interleaved(int index, RecreatedMessageQueue& request_queue, + RecreatedMessageQueue& reply_queue) { + cv::RNG rng(0); + + for (size_t x = 1; x <= 16; ++x) { + for (size_t y = 1; y <= 16; ++y) { + cv::Mat input(x, y, CV_8UC1); + rng.fill(input, cv::RNG::UNIFORM, 0, 255); + + cv::Mat actual = exec_scharr_interleaved(input); + cv::Mat expected = get_expected_from_subordinate(index, request_queue, + reply_queue, input); + + if (are_matrices_different(0, actual, expected)) { + fail_print_matrices(x, y, input, actual, expected); + return true; + } + } + } + + return false; +} +#endif + +std::vector& scharr_interleaved_tests_get() { + // clang-format off + static std::vector tests = { + TEST("Scharr Interleaved, 1 channel", test_scharr_interleaved, exec_scharr_interleaved) + }; + // clang-format on + return tests; +} diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp index 14ccc52c6..b2648707f 100644 --- a/conformity/opencv/tests.cpp +++ b/conformity/opencv/tests.cpp @@ -41,6 +41,7 @@ std::vector all_tests = merge_tests({ in_range_tests_get, remap_tests_get, blur_and_downsample_tests_get, + scharr_interleaved_tests_get, // clang-format on }); diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h index c1b3c8a4d..e148873a2 100644 --- a/conformity/opencv/tests.h +++ b/conformity/opencv/tests.h @@ -24,5 +24,6 @@ std::vector& min_max_tests_get(); std::vector& in_range_tests_get(); std::vector& remap_tests_get(); std::vector& blur_and_downsample_tests_get(); +std::vector& scharr_interleaved_tests_get(); #endif // KLEIDICV_OPENCV_CONFORMITY_TESTS_H_ diff --git a/doc/opencv.md b/doc/opencv.md index d56d7dedd..0c5e6b789 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -209,3 +209,6 @@ Blurs and downsamples an image. Notes on parameters: * `src.depth()` - only supports `CV_8U` and 1 channel. * if `dstsize` is specified it must be equal to `Size((src.cols + 1) / 2, (src.rows + 1) / 2)` + +### [`cv::buildOpticalFlowPyramid()`](https://docs.opencv.org/4.10.0/dc/d6b/group__video__track.html#ga86640c1c470f87b2660c096d2b22b2ce) +Constructs an image pyramid which can be passed to `cv::calcOpticalFlowPyrLK`. diff --git a/kleidicv/include/kleidicv/filters/scharr.h b/kleidicv/include/kleidicv/filters/scharr.h new file mode 100644 index 000000000..a81218d20 --- /dev/null +++ b/kleidicv/include/kleidicv/filters/scharr.h @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_FILTERS_SCHARR_H +#define KLEIDICV_FILTERS_SCHARR_H + +#include "kleidicv/config.h" +#include "kleidicv/kleidicv.h" + +extern "C" { +// For internal use only. See instead kleidicv_scharr_interleaved_s16_u8. +// Filter a horizontal stripe across an image. The stripe is defined by the +// range (y_begin, y_end]. +KLEIDICV_API_DECLARATION(kleidicv_scharr_interleaved_stripe_s16_u8, + const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, + size_t y_begin, size_t y_end); +} + +namespace kleidicv { + +inline bool scharr_interleaved_is_implemented(size_t src_width, + size_t src_height, + size_t src_channels) { + return src_width > 2 && src_height > 2 && src_channels == 1; +} + +namespace neon { + +kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end); + +} // namespace neon + +namespace sve2 {} // namespace sve2 + +namespace sme2 {} // namespace sme2 + +} // namespace kleidicv + +#endif // KLEIDICV_FILTERS_SCHARR_H diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index a2a0848b9..e76dfc8b7 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -1780,6 +1780,61 @@ KLEIDICV_API_DECLARATION(kleidicv_remap_s16point5_u8, const uint8_t *src, kleidicv_border_values_t border_values); #endif // DOXYGEN +#ifndef DOXYGEN +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Calculates horizontal and vertical derivative approximation with Scharr +/// filter and store the results interleaved. +/// +/// The horizontal convolution kernel is: +/// ``` +/// [ 3 0 -3 ] +/// [ 10 0 -10 ] +/// [ 3 0 -3 ] +/// ``` +/// +/// The vertical convolution kernel is: +/// ``` +/// [ 3 10 3 ] +/// [ 0 0 0 ] +/// [ -3 -10 -3 ] +/// ``` +/// +/// Note, that the kernels are mirrored both vertically and horizontally during +/// the convolution. +/// +/// This API does not handle borders, so the result's width and height is `width +/// - 2` and `height - 2`, respectively. Number of pixels in the source is +/// limited to @ref KLEIDICV_MAX_IMAGE_PIXELS. Result's channel count is the +/// double of the source' channel count, as the calculated derivative +/// approximations are stored interleaved: +/// ``` +/// | dx,dy | dx,dy | dx,dy | ... +/// ``` +/// Where `dx` is the horizontal derivative approximation and `dy` is the +/// vertical derivative approximation. +/// +/// @param src Pointer to the source data. Must be non-null. +/// @param src_stride Distance in bytes from the start of one row to the +/// start of the next row in the source data. Must be a +/// multiple of `sizeof(src type)` and no less than `width * +/// sizeof(src type) * channels`. +/// @param src_width Number of columns in the source. Must be more than 2. +/// (One column consists of `channels` number of elements.) +/// @param src_height Number of rows in the source. Must be more than 2. +/// @param src_channels Number of channels in the source data. Must be equal +/// to 1. +/// @param dst Pointer to the destination data. Must be non-null. +/// @param dst_stride Distance in bytes from the start of one row to the +/// start of the next row in the destination data. Must be a +/// multiple of `sizeof(dst type)` and no less than `(width +/// - 2) * sizeof(dst type) * channels`. +/// +kleidicv_error_t kleidicv_scharr_interleaved_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride); +#endif // DOXYGEN + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/kleidicv/src/filters/scharr_api.cpp b/kleidicv/src/filters/scharr_api.cpp new file mode 100644 index 000000000..b49836e91 --- /dev/null +++ b/kleidicv/src/filters/scharr_api.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/dispatch.h" +#include "kleidicv/filters/scharr.h" +#include "kleidicv/kleidicv.h" + +KLEIDICV_MULTIVERSION_C_API( + kleidicv_scharr_interleaved_stripe_s16_u8, + &kleidicv::neon::kleidicv_scharr_interleaved_stripe_s16_u8, nullptr, + nullptr); + +extern "C" { + +kleidicv_error_t kleidicv_scharr_interleaved_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride) { + if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height, + src_channels)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + // height is decremented by 2 as the result has less rows. + return kleidicv_scharr_interleaved_stripe_s16_u8( + src, src_stride, src_width, src_height, src_channels, dst, dst_stride, 0, + src_height - 2); +} + +} // extern "C" diff --git a/kleidicv/src/filters/scharr_neon.cpp b/kleidicv/src/filters/scharr_neon.cpp new file mode 100644 index 000000000..cc8bfb7bb --- /dev/null +++ b/kleidicv/src/filters/scharr_neon.cpp @@ -0,0 +1,228 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +#include "kleidicv/config.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/filters/scharr.h" +#include "kleidicv/neon.h" +#include "kleidicv/types.h" +#include "kleidicv/utils.h" + +namespace kleidicv::neon { + +// Scharr filtering in both horizontal and vertical directions, horizontal and +// vertical derivative approximations are stored interleaved. +// +// The applied weights for the horizontal approximation, as the kernel is +// mirrored both vertically and horizontally during the convolution: +// [ -3 0 3 ] [ 3 ] +// F = [ -10 0 10 ] = [ 10 ] * [ -1, 0, 1 ] +// [ -3 0 3 ] [ 3 ] +// +// The applied weights for the vertical approximation, as the kernel is mirrored +// both vertically and horizontally during the convolution: +// [ -3 -10 -3 ] [ -1 ] +// F = [ 0, 0, 0 ] = [ 0 ] * [ 3, 10, 3 ] +// [ 3 10 3 ] [ 1 ] +// +class ScharrInterleaved { + using SourceType = uint8_t; + using SourceVecTraits = VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferType = int16_t; + using BufferVecTraits = VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using BufferVector4Type = typename BufferVecTraits::Vector4Type; + using DestinationType = int16_t; + using DestinationVecTraits = VecTraits; + using DestinationVectorType = typename DestinationVecTraits::VectorType; + + public: + ScharrInterleaved(Rows hori_deriv_buffer, + Rows vert_deriv_buffer, size_t width) + : hori_deriv_buffer_(hori_deriv_buffer), + vert_deriv_buffer_(vert_deriv_buffer), + width_(width), + const_3_s16_(vdupq_n_s16(3)), + const_10_u8_(vdupq_n_u8(10)), + const_10_s16_(vdupq_n_s16(10)) {} + + void process(Rows src_rows, Rows dst_rows, + size_t y_begin, size_t y_end) { + for (size_t i = y_begin; i < y_end; ++i) { + process_vertical(src_rows.at(static_cast(i))); + process_horizontal(dst_rows.at(static_cast(i))); + } + } + + private: + BufferVector4Type vertical_vector_path(SourceVectorType src[3]) { + // Horizontal derivative approximation + uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2])); + uint16x8_t hori_acc_h = + vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2])); + + hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_); + hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_); + + hori_acc_l = + vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_)); + hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_); + + // Vertical derivative approximation + uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0])); + uint16x8_t vert_acc_h = + vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0])); + + return { + vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h), + vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)}; + } + + void process_vertical(Rows src_rows) { + LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes}; + + loop.unroll_once([&](ptrdiff_t index) { + SourceVectorType src[3]; + src[0] = vld1q(&src_rows.at(0)[index]); + src[1] = vld1q(&src_rows.at(1)[index]); + src[2] = vld1q(&src_rows.at(2)[index]); + + BufferVector4Type res = vertical_vector_path(src); + + vst1q(&hori_deriv_buffer_[index], res.val[0]); + vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]); + vst1q(&vert_deriv_buffer_[index], res.val[2]); + vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]); + }); + + loop.tail([&](ptrdiff_t index) { + hori_deriv_buffer_[index] = static_cast( + (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 + + src_rows.at(1)[index] * 10); + + vert_deriv_buffer_[index] = static_cast( + src_rows.at(2)[index] - src_rows.at(0)[index]); + }); + } + + DestinationVectorType horizontal_vector_path_hori_approx( + BufferVectorType buff[2]) { + return vsubq_s16(buff[1], buff[0]); + } + + DestinationVectorType horizontal_vector_path_vert_approx( + BufferVectorType buff[3]) { + BufferVectorType a = vaddq_u16(buff[0], buff[2]); + a = vaddq_u16(a, vaddq_u16(a, a)); + return vmlaq_u16(a, buff[1], const_10_s16_); + } + + void process_horizontal(Rows dst_rows) { + // width is decremented by 2 as the result has less columns. + LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), + kBufferVecNumLanes}; + + loop.unroll_once([&](ptrdiff_t index) { + BufferVectorType hori_buff[2]; + hori_buff[0] = vld1q(&hori_deriv_buffer_[index]); + hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]); + DestinationVectorType hori_approx_res = + horizontal_vector_path_hori_approx(hori_buff); + + BufferVectorType vert_buff[3]; + vert_buff[0] = vld1q(&vert_deriv_buffer_[index]); + vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]); + vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]); + DestinationVectorType vert_approx_res = + horizontal_vector_path_vert_approx(vert_buff); + + vst1q(&dst_rows.at(0, index)[0], + vzip1q_s16(hori_approx_res, vert_approx_res)); + vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()], + vzip2q_s16(hori_approx_res, vert_approx_res)); + }); + + loop.tail([&](ptrdiff_t index) { + dst_rows.at(0, index)[0] = static_cast( + // For some reason clang-tidy thinks these accesses are invalid + // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign, + // clang-analyzer-core.UndefinedBinaryOperatorResult) + hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]); + // NOLINTEND(clang-analyzer-core.uninitialized.Assign, + // clang-analyzer-core.UndefinedBinaryOperatorResult) + + dst_rows.at(0, index)[1] = static_cast( + (vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 + + vert_deriv_buffer_[index + 1] * 10); + }); + } + + Rows hori_deriv_buffer_; + Rows vert_deriv_buffer_; + size_t width_; + int16x8_t const_3_s16_; + uint8x16_t const_10_u8_; + int16x8_t const_10_s16_; + + static constexpr ptrdiff_t kSourceVecNumLanes = + static_cast(SourceVecTraits::num_lanes()); + static constexpr ptrdiff_t kBufferVecNumLanes = + static_cast(BufferVecTraits::num_lanes()); +}; // end of class ScharrInterleaved + +class ScharrBufferDeleter { + public: + void operator()(void *ptr) const { std::free(ptr); } +}; + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end) { + // Does not include checks for whether the operation is implemented. + // This must be done earlier, by scharr_interleaved_is_implemented. + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height); + CHECK_IMAGE_SIZE(src_width, src_height); + + size_t buffer_stride = src_width * src_channels * sizeof(int16_t); + // Buffer has two rows, one for the horizontal derivative approximation, one + // for the vertical one. + size_t buffer_height = 2; + // Memory is allocated with malloc to avoid its initialization. + void *allocation = std::malloc(buffer_stride * buffer_height); + + if (!allocation) { + return KLEIDICV_ERROR_ALLOCATION; + } + + std::unique_ptr buffer( + reinterpret_cast(allocation)); + + Rows src_rows{src, src_stride, src_channels}; + + // Result is treated as it has double the channel number compared to the + // input. + Rows dst_rows{dst, dst_stride, src_channels * 2}; + + Rows hori_deriv_buffer{buffer.get(), buffer_stride, src_channels}; + + int16_t *vert_deriv_ptr = reinterpret_cast( + reinterpret_cast(buffer.get()) + buffer_stride); + Rows vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels}; + + ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width) + .process(src_rows, dst_rows, y_begin, y_end); + + return KLEIDICV_OK; +} + +} // namespace kleidicv::neon diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index f96c488c5..55d3cff66 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -339,6 +339,15 @@ kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( size_t width, size_t height, size_t channels, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_scharr_interleaved_s16_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, + kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 1c9263cf3..ca6071134 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -11,6 +11,7 @@ #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" +#include "kleidicv/filters/scharr.h" #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" @@ -558,6 +559,25 @@ kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( return parallel_batches(callback, mt, height); } +kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, + kleidicv_thread_multithreading mt) { + if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height, + src_channels)) { + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } + + auto callback = [=](unsigned y_begin, unsigned y_end) { + return kleidicv_scharr_interleaved_stripe_s16_u8( + src, src_stride, src_width, src_height, src_channels, dst, dst_stride, + y_begin, y_end); + }; + + // height is decremented by 2 as the result has less rows. + return parallel_batches(callback, mt, src_height - 2); +} + kleidicv_error_t kleidicv_thread_resize_to_quarter_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index d40d2dd9c..dfc095385 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -80,3 +80,5 @@ Remap_S16_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER Remap_S16Point5_U8: opencv_perf_imgproc '*Remap/*' '($PIXEL_FORMAT, 8UC1, 16SC2, INTER_LINEAR, BORDER_REPLICATE)' BlurAndDownsample: opencv_perf_imgproc '*pyrDown/*' '($PIXEL_FORMAT, 8UC1)' + +ScharrInterleaved: opencv_perf_video '*calcScharrDeriv/*' '($PIXEL_FORMAT)' diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh index 9ceb19644..d0a7dc004 100755 --- a/scripts/ci-opencv.sh +++ b/scripts/ci-opencv.sh @@ -69,7 +69,10 @@ CLEAN="ON" \ # ------------------------------------------------------------------------------ # Build OpenCV test executables from already configured conformity check project # The OpenCV source is patched in this case -ninja -C build/conformity/opencv_kleidicv opencv_test_imgproc opencv_test_core +ninja -C build/conformity/opencv_kleidicv \ + opencv_test_imgproc \ + opencv_test_core \ + opencv_test_video # Some tests require opencv_extra for the test images tar xf /opt/opencv-extra-${OPENCV_VERSION}.tar.gz -C build @@ -125,6 +128,13 @@ CORE_TEST_PATTERNS_STR="$(join_strings_with_colon "${CORE_TEST_PATTERNS[*]}")" ../../../conformity/opencv_kleidicv/bin/opencv_test_core \ --gtest_filter="${CORE_TEST_PATTERNS_STR}" || TESTRESULT=1 +VIDEO_TEST_PATTERNS=( + 'Video_OpticalFlowPyrLK.accuracy' +) +VIDEO_TEST_PATTERNS_STR="$(join_strings_with_colon "${VIDEO_TEST_PATTERNS[*]}")" +../../../conformity/opencv_kleidicv/bin/opencv_test_video \ + --gtest_filter="${VIDEO_TEST_PATTERNS_STR}" || TESTRESULT=1 + popd exit $TESTRESULT diff --git a/scripts/run_opencv_conformity_checks.sh b/scripts/run_opencv_conformity_checks.sh index 87248cf31..ab7183d1d 100755 --- a/scripts/run_opencv_conformity_checks.sh +++ b/scripts/run_opencv_conformity_checks.sh @@ -29,7 +29,7 @@ common_cmake_args=( "-DBUILD_SHARED_LIBS=OFF" "-DBUILD_TESTS=ON" "-DBUILD_PERF_TESTS=OFF" - "-DBUILD_LIST=imgproc,core,ts" + "-DBUILD_LIST=imgproc,core,video,ts" "-DCV_TRACE=OFF" "-DBUILD_EXAMPLES=OFF" diff --git a/test/api/test_scharr.cpp b/test/api/test_scharr.cpp new file mode 100644 index 000000000..4c2b8ea1c --- /dev/null +++ b/test/api/test_scharr.cpp @@ -0,0 +1,191 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "framework/kernel.h" +#include "framework/types.h" +#include "framework/utils.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/kleidicv.h" +#include "test_config.h" + +// Test for Scharr operator with interleaved output. +class ScharrInterleavedTest { + using InputType = uint8_t; + using IntermediateType = int16_t; + using OutputType = int16_t; + + public: + ScharrInterleavedTest() + : vertical_mask_{create_vertical_mask()}, + horizontal_mask_{create_horizontal_mask()} {} + + void test() { + // Use the default array layouts with one channel for testing. + for (auto layout : test::default_1channel_array_layouts(3, 3)) { + test(layout); + } + } + + private: + static test::Array2D create_vertical_mask() { + test::Array2D mask{3, 3}; + + mask.set(0, 0, {-3, -10, -3}); + mask.set(1, 0, {0, 0, 0}); + mask.set(2, 0, {3, 10, 3}); + + return mask; + } + + static test::Array2D create_horizontal_mask() { + test::Array2D mask{3, 3}; + + mask.set(0, 0, {-3, 0, 3}); + mask.set(1, 0, {-10, 0, 10}); + mask.set(2, 0, {-3, 0, 3}); + + return mask; + } + + void test(const test::ArrayLayout& layout) { + // Input + input_ = test::Array2D{layout}; + ASSERT_TRUE(input_.valid()); + test::PseudoRandomNumberGenerator element_generator; + input_.fill(element_generator); + + // Output has less rows and columns as borders are not handled in the same + // way as in case of most of the other filter operations. + test::ArrayLayout output_layout{(layout.width - 2) * 2, layout.height - 2, + layout.padding, layout.channels * 2}; + + // Expected + expected_ = test::Array2D{output_layout}; + ASSERT_TRUE(expected_.valid()); + calculate_expected(); + + // Actual + actual_ = test::Array2D{output_layout}; + ASSERT_TRUE(actual_.valid()); + EXPECT_EQ(KLEIDICV_OK, + kleidicv_scharr_interleaved_s16_u8( + input_.data(), input_.stride(), + input_.width() / input_.channels(), input_.height(), + input_.channels(), actual_.data(), actual_.stride())); + + // Check results + EXPECT_EQ_ARRAY2D(expected_, actual_); + } + + void calculate_expected() { + for (size_t row = 0; row < expected_.height(); ++row) { + for (size_t column = 0; column < expected_.width(); column += 2) { + IntermediateType horizontal_result = + calculate_expected_at(horizontal_mask_, input_, row, column / 2); + expected_.at(row, column)[0] = + static_cast(horizontal_result); + + IntermediateType vertical_result = + calculate_expected_at(vertical_mask_, input_, row, column / 2); + expected_.at(row, column)[1] = static_cast(vertical_result); + } + } + } + + IntermediateType calculate_expected_at( + const test::Kernel& kernel, + const test::TwoDimensional& source, size_t row, + size_t column) { + IntermediateType result{0}; + for (size_t height = 0; height < kernel.height(); ++height) { + for (size_t width = 0; width < kernel.width(); ++width) { + IntermediateType coefficient = kernel.at(height, width)[0]; + InputType value = + source.at(row + height, column + width * source.channels())[0]; + result = test::saturating_add( + result, test::saturating_mul(coefficient, + static_cast(value))); + } + } + + return result; + } + + const test::Kernel vertical_mask_; + const test::Kernel horizontal_mask_; + + test::Array2D input_; + test::Array2D expected_; + test::Array2D actual_; +}; // end of class ScharrInterleavedTest + +// Tests kleidicv_scharr_interleaved_s16_u8 API. +TEST(ScharrInterleaved, API) { ScharrInterleavedTest{}.test(); } + +TEST(ScharrInterleaved, NullPointer) { + uint8_t src[1] = {}; + int16_t dst[1]; + test::test_null_args(kleidicv_scharr_interleaved_s16_u8, src, sizeof(uint8_t), + 3, 3, 1, dst, sizeof(int16_t)); +} + +TEST(ScharrInterleaved, Misalignment) { + uint8_t src[1] = {}; + int16_t dst[1]; + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + kleidicv_scharr_interleaved_s16_u8(src, sizeof(uint8_t), 3, 4, 1, + dst, 1)); +} + +TEST(ScharrInterleaved, UndersizedImage) { + uint8_t src[1] = {}; + int16_t dst[1]; + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_scharr_interleaved_s16_u8(src, sizeof(uint8_t), 1, 1, 1, + dst, sizeof(int16_t))); + + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_scharr_interleaved_s16_u8(src, sizeof(uint8_t), 3, 1, 1, + dst, sizeof(int16_t))); +} + +TEST(ScharrInterleaved, OversizedImage) { + uint8_t src[1] = {}; + int16_t dst[1]; + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + kleidicv_scharr_interleaved_s16_u8(src, sizeof(uint8_t), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 3, + 1, dst, sizeof(int16_t))); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + kleidicv_scharr_interleaved_s16_u8( + src, sizeof(uint8_t), KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, 1, dst, sizeof(int16_t))); +} + +TEST(ScharrInterleaved, ChannelNumber) { + uint8_t src[1] = {}; + int16_t dst[1]; + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_scharr_interleaved_s16_u8(src, sizeof(uint8_t), 1, 1, 2, + dst, sizeof(int16_t))); +} + +#ifdef KLEIDICV_ALLOCATION_TESTS +TEST(ScharrInterleaved, Allocation) { + uint8_t src[1] = {}; + int16_t dst[1]; + MockMallocToFail::enable(); + kleidicv_error_t ret = kleidicv_scharr_interleaved_s16_u8( + src, sizeof(uint8_t), 3, 3, 1, dst, sizeof(int16_t)); + MockMallocToFail::disable(); + + EXPECT_EQ(KLEIDICV_ERROR_ALLOCATION, ret); +} +#endif diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 86d1ebd7d..48af96271 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -425,6 +425,49 @@ TEST(ThreadBlurAndDownsample, NotImplemented) { ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } +TEST_P(Thread, scharr_interleaved_s16_u8) { + unsigned src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + + // Minimal width and height is 3 + src_width += 2; + src_height += 2; + + size_t src_channels = 1; + + test::Array2D src(size_t{src_width} * src_channels, src_height); + test::Array2D dst_single(size_t{src_width - 2} * src_channels * 2, + size_t{src_height - 2}), + dst_multi(size_t{src_width - 2} * src_channels * 2, + size_t{src_height - 2}); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = kleidicv_scharr_interleaved_s16_u8( + src.data(), src.stride(), src_width, src_height, src_channels, + dst_single.data(), dst_single.stride()); + + kleidicv_error_t multi_result = kleidicv_thread_scharr_interleaved_s16_u8( + src.data(), src.stride(), src_width, src_height, src_channels, + dst_multi.data(), dst_multi.stride(), + get_multithreading_fake(thread_count)); + + EXPECT_EQ(single_result, multi_result); + if (KLEIDICV_OK == single_result) { + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } +} + +TEST(ThreadScharrInterleaved, NotImplemented) { + uint8_t src[1] = {}; + int16_t dst[1] = {}; + // Multichannel input + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_thread_scharr_interleaved_s16_u8( + src, 1, 1, 1, 2, dst, sizeof(dst), get_multithreading_fake(2))); +} + TEST_P(Thread, separable_filter_2d_u8) { check_separable_filter_2d(kleidicv_separable_filter_2d_u8, kleidicv_thread_separable_filter_2d_u8); -- GitLab From a29aff259d133b5ef913214cd574939218996885 Mon Sep 17 00:00:00 2001 From: Mark Horvath Date: Fri, 8 Nov 2024 21:38:59 +0000 Subject: [PATCH 2/2] Add SVE2/SME2 version of scharr_interleaved_s16_u8 --- benchmark/benchmark.cpp | 10 + kleidicv/include/kleidicv/filters/scharr.h | 16 +- kleidicv/src/filters/scharr_api.cpp | 6 +- kleidicv/src/filters/scharr_sc.h | 205 +++++++++++++++++++++ kleidicv/src/filters/scharr_sme2.cpp | 21 +++ kleidicv/src/filters/scharr_sve2.cpp | 20 ++ 6 files changed, 274 insertions(+), 4 deletions(-) create mode 100644 kleidicv/src/filters/scharr_sc.h create mode 100644 kleidicv/src/filters/scharr_sme2.cpp create mode 100644 kleidicv/src/filters/scharr_sve2.cpp diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 1c608b13d..803fab42d 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -485,6 +485,16 @@ static void blur_and_downsample_u8(benchmark::State& state) { } BENCHMARK(blur_and_downsample_u8); +static void scharr_interleaved_s16_u8(benchmark::State& state) { + bench_functor(state, []() { + (void)kleidicv_scharr_interleaved_s16_u8( + get_source_buffer_a(), image_width * sizeof(uint8_t), + image_width, image_height, 1, get_destination_buffer(), + (image_width - 2) * sizeof(int16_t)); + }); +} +BENCHMARK(scharr_interleaved_s16_u8); + template static const ScalarType* get_random_mapxy() { auto generate_mapxy = [&]() { diff --git a/kleidicv/include/kleidicv/filters/scharr.h b/kleidicv/include/kleidicv/filters/scharr.h index a81218d20..e6bde054c 100644 --- a/kleidicv/include/kleidicv/filters/scharr.h +++ b/kleidicv/include/kleidicv/filters/scharr.h @@ -36,9 +36,21 @@ kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( } // namespace neon -namespace sve2 {} // namespace sve2 +namespace sve2 { -namespace sme2 {} // namespace sme2 +kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end); +} // namespace sve2 + +namespace sme2 { + +kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end); +} // namespace sme2 } // namespace kleidicv diff --git a/kleidicv/src/filters/scharr_api.cpp b/kleidicv/src/filters/scharr_api.cpp index b49836e91..b6a06bfef 100644 --- a/kleidicv/src/filters/scharr_api.cpp +++ b/kleidicv/src/filters/scharr_api.cpp @@ -8,8 +8,10 @@ KLEIDICV_MULTIVERSION_C_API( kleidicv_scharr_interleaved_stripe_s16_u8, - &kleidicv::neon::kleidicv_scharr_interleaved_stripe_s16_u8, nullptr, - nullptr); + &kleidicv::neon::kleidicv_scharr_interleaved_stripe_s16_u8, + KLEIDICV_SVE2_IMPL_IF( + &kleidicv::sve2::kleidicv_scharr_interleaved_stripe_s16_u8), + &kleidicv::sme2::kleidicv_scharr_interleaved_stripe_s16_u8); extern "C" { diff --git a/kleidicv/src/filters/scharr_sc.h b/kleidicv/src/filters/scharr_sc.h new file mode 100644 index 000000000..836d52ba8 --- /dev/null +++ b/kleidicv/src/filters/scharr_sc.h @@ -0,0 +1,205 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include +#include + +#include "kleidicv/config.h" +#include "kleidicv/ctypes.h" +#include "kleidicv/sve2.h" +#include "kleidicv/types.h" +#include "kleidicv/utils.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +// Scharr filtering in both horizontal and vertical directions, horizontal and +// vertical derivative approximations are stored interleaved. +// +// The applied weights for the horizontal approximation, as the kernel is +// mirrored both vertically and horizontally during the convolution: +// [ -3 0 3 ] [ 3 ] +// F = [ -10 0 10 ] = [ 10 ] * [ -1, 0, 1 ] +// [ -3 0 3 ] [ 3 ] +// +// The applied weights for the vertical approximation, as the kernel is mirrored +// both vertically and horizontally during the convolution: +// [ -3 -10 -3 ] [ -1 ] +// F = [ 0, 0, 0 ] = [ 0 ] * [ 3, 10, 3 ] +// [ 3 10 3 ] [ 1 ] +// +class ScharrInterleaved { + using SourceType = uint8_t; + using SourceVecTraits = VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; + using BufferType = int16_t; + using BufferVecTraits = VecTraits; + using BufferVectorType = typename BufferVecTraits::VectorType; + using DestinationType = int16_t; + using DestinationVecTraits = VecTraits; + using DestinationVectorType = typename DestinationVecTraits::VectorType; + + public: + ScharrInterleaved(Rows hori_deriv_buffer, + Rows vert_deriv_buffer, + size_t width) KLEIDICV_STREAMING_COMPATIBLE + : hori_deriv_buffer_(hori_deriv_buffer), + vert_deriv_buffer_(vert_deriv_buffer), + width_(width) {} + + void process(Rows src_rows, Rows dst_rows, + size_t y_begin, size_t y_end) KLEIDICV_STREAMING_COMPATIBLE { + for (size_t i = y_begin; i < y_end; ++i) { + process_vertical(src_rows.at(static_cast(i))); + process_horizontal(dst_rows.at(static_cast(i))); + } + } + + private: + void vertical_vector_path(svbool_t pg, Rows src_rows, + ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + SourceVectorType src_0 = svld1(pg, &src_rows.at(0)[index]); + SourceVectorType src_1 = svld1(pg, &src_rows.at(1)[index]); + SourceVectorType src_2 = svld1(pg, &src_rows.at(2)[index]); + + // Horizontal derivative approximation + svuint16_t hori_acc_b = svaddlb(src_0, src_2); + svuint16_t hori_acc_t = svaddlt(src_0, src_2); + + hori_acc_b = svmul_n_u16_x(pg, hori_acc_b, 3); + hori_acc_t = svmul_n_u16_x(pg, hori_acc_t, 3); + + hori_acc_b = svmlalb_n_u16(hori_acc_b, src_1, 10); + hori_acc_t = svmlalt_n_u16(hori_acc_t, src_1, 10); + + svint16x2_t hori_interleaved = + svcreate2(svreinterpret_s16(hori_acc_b), svreinterpret_s16(hori_acc_t)); + svst2(pg, &hori_deriv_buffer_[index], hori_interleaved); + + // Vertical derivative approximation + svuint16_t vert_acc_b = svsublb(src_2, src_0); + svuint16_t vert_acc_t = svsublt(src_2, src_0); + + svint16x2_t vert_interleaved = + svcreate2(svreinterpret_s16(vert_acc_b), svreinterpret_s16(vert_acc_t)); + svst2(pg, &vert_deriv_buffer_[index], vert_interleaved); + } + + void process_vertical(Rows src_rows) + KLEIDICV_STREAMING_COMPATIBLE { + LoopUnroll2 loop{width_ * src_rows.channels(), + SourceVecTraits::num_lanes()}; + svbool_t pg_all = SourceVecTraits::svptrue(); + + loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + vertical_vector_path(pg_all, src_rows, index); + }); + + loop.remaining( + [&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = SourceVecTraits::svwhilelt(index, length); + vertical_vector_path(pg, src_rows, index); + }); + } + + void horizontal_vector_path(svbool_t pg, Rows dst_rows, + ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + // Horizontal derivative approximation + BufferVectorType hori_buff_0 = svld1(pg, &hori_deriv_buffer_[index]); + BufferVectorType hori_buff_2 = svld1(pg, &hori_deriv_buffer_[index + 2]); + + DestinationVectorType hori_result = svsub_x(pg, hori_buff_2, hori_buff_0); + + // Vertical derivative approximation + BufferVectorType vert_buff_0 = svld1(pg, &vert_deriv_buffer_[index]); + BufferVectorType vert_buff_1 = svld1(pg, &vert_deriv_buffer_[index + 1]); + BufferVectorType vert_buff_2 = svld1(pg, &vert_deriv_buffer_[index + 2]); + + DestinationVectorType vert_result = svadd_x(pg, vert_buff_0, vert_buff_2); + vert_result = svmul_n_s16_x(pg, vert_result, 3); + vert_result = svmad_s16_x(pg, vert_buff_1, svdup_n_s16(10), vert_result); + + // Store results + svint16x2_t interleaved_result = svcreate2(hori_result, vert_result); + svst2(pg, &dst_rows.at(0, index)[0], interleaved_result); + } + + void process_horizontal(Rows dst_rows) + KLEIDICV_STREAMING_COMPATIBLE { + // width is decremented by 2 as the result has less columns. + LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), + BufferVecTraits::num_lanes()}; + svbool_t pg_all = BufferVecTraits::svptrue(); + + loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING_COMPATIBLE { + horizontal_vector_path(pg_all, dst_rows, index); + }); + + loop.remaining( + [&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = BufferVecTraits::svwhilelt(index, length); + horizontal_vector_path(pg, dst_rows, index); + }); + } + + Rows hori_deriv_buffer_; + Rows vert_deriv_buffer_; + size_t width_; +}; // end of class ScharrInterleaved + +class ScharrBufferDeleter { + public: + void operator()(void *ptr) const { std::free(ptr); } +}; + +static kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8_sc( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end) KLEIDICV_STREAMING_COMPATIBLE { + // Does not include checks for whether the operation is implemented. + // This must be done earlier, by scharr_interleaved_is_implemented. + CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height); + CHECK_IMAGE_SIZE(src_width, src_height); + + // Allocating more elements because in case of SVE interleaving stores are + // governed by one predicate. For example, if a predicate requires 7 uint8_t + // elements and an algorithm performs widening to 16 bits, the resulting + // interleaving store will still be governed by the same predicate, thus + // storing 8 elements. Choosing '3' to account for svst4(). + size_t buffer_stride = ((src_width * src_channels) + 3) * sizeof(int16_t); + // Buffer has two rows, one for the horizontal derivative approximation, one + // for the vertical one. + size_t buffer_height = 2; + // Memory is allocated with malloc to avoid its initialization. + void *allocation = std::malloc(buffer_stride * buffer_height); + + if (!allocation) { + return KLEIDICV_ERROR_ALLOCATION; + } + + std::unique_ptr buffer( + reinterpret_cast(allocation)); + + Rows src_rows{src, src_stride, src_channels}; + + // Result is treated as it has double the channel number compared to the + // input. + Rows dst_rows{dst, dst_stride, src_channels * 2}; + + Rows hori_deriv_buffer{buffer.get(), buffer_stride, src_channels}; + + int16_t *vert_deriv_ptr = reinterpret_cast( + reinterpret_cast(buffer.get()) + buffer_stride); + Rows vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels}; + + ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width) + .process(src_rows, dst_rows, y_begin, y_end); + + return KLEIDICV_OK; +} +} // namespace KLEIDICV_TARGET_NAMESPACE diff --git a/kleidicv/src/filters/scharr_sme2.cpp b/kleidicv/src/filters/scharr_sme2.cpp new file mode 100644 index 000000000..be51c94ed --- /dev/null +++ b/kleidicv/src/filters/scharr_sme2.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/filters/scharr.h" +#include "scharr_sc.h" + +namespace kleidicv::sme2 { + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +kleidicv_scharr_interleaved_stripe_s16_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, + size_t dst_stride, size_t y_begin, + size_t y_end) { + return kleidicv_scharr_interleaved_stripe_s16_u8_sc( + src, src_stride, src_width, src_height, src_channels, dst, dst_stride, + y_begin, y_end); +} + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/filters/scharr_sve2.cpp b/kleidicv/src/filters/scharr_sve2.cpp new file mode 100644 index 000000000..366af2900 --- /dev/null +++ b/kleidicv/src/filters/scharr_sve2.cpp @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/filters/scharr.h" +#include "scharr_sc.h" + +namespace kleidicv::sve2 { + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, + size_t y_end) { + return kleidicv_scharr_interleaved_stripe_s16_u8_sc( + src, src_stride, src_width, src_height, src_channels, dst, dst_stride, + y_begin, y_end); +} + +} // namespace kleidicv::sve2 -- GitLab