From d26597b42f902bd84d847b1bfd2ce1e95ad87063 Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 20 Aug 2024 08:53:58 +0000 Subject: [PATCH] Remove multithreading for memory-bound operations It was found that enabling multithreading for these functions did not significantly improve performance compared to the single-threaded equivalents. --- CHANGELOG.md | 5 --- adapters/opencv/kleidicv_hal.cpp | 11 +++--- adapters/opencv/kleidicv_hal.h | 68 +++++++++++++++----------------- 3 files changed, 37 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb369d921..867d936f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,11 +43,6 @@ This changelog aims to follow the guiding principles of * convertTo * exp * compare - * add - * sub - * mul - * absdiff - * bitwise_and * minMaxIdx * GaussianBlur * Sobel diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 81bedaadb..b0b779254 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -68,7 +68,7 @@ static kleidicv_error_t parallel(kleidicv_thread_callback callback, return shared_result; } -kleidicv_thread_multithreading get_multithreading() { +static kleidicv_thread_multithreading get_multithreading() { return kleidicv_thread_multithreading{parallel, nullptr}; } @@ -120,14 +120,13 @@ int bgr_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, if (scn == 4 && dcn == 4) { if (swapBlue) { - return convert_error(kleidicv_thread_rgba_to_bgra_u8( + return convert_error(kleidicv_rgba_to_bgra_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height, - mt)); + reinterpret_cast(dst_data), dst_step, width, height)); } - return convert_error(kleidicv_thread_rgba_to_rgba_u8( + return convert_error(kleidicv_rgba_to_rgba_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height, mt)); + reinterpret_cast(dst_data), dst_step, width, height)); } } diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index ea6beaf85..2505922cb 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -22,8 +22,6 @@ namespace hal { // Macros to shorten repeated code. #define KLEIDICV_HAL_API(api) (kleidicv::hal::api) -kleidicv_thread_multithreading get_multithreading(); - int gray_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn); @@ -140,11 +138,9 @@ namespace cv { // If the KleidiCV function has a signature matching the OpenCV HAL interface // AND it never returns KLEIDICV_NOT_IMPLEMENTED then we can call it directly // and convert the return code. -#define KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_impl, ...) \ - (kleidicv_thread_impl(__VA_ARGS__, kleidicv::hal::get_multithreading()) == \ - KLEIDICV_OK \ - ? CV_HAL_ERROR_OK \ - : CV_HAL_ERROR_UNKNOWN) +#define KLEIDICV_HAL_FORWARD(kleidicv_impl, ...) \ + (kleidicv_impl(__VA_ARGS__) == KLEIDICV_OK ? CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_UNKNOWN) #define KLEIDICV_HAL_FALLBACK_FORWARD(kleidicv_impl, fallback_hal_impl, ...) \ (KLEIDICV_HAL_API(kleidicv_impl)(__VA_ARGS__) == CV_HAL_ERROR_OK \ @@ -441,63 +437,63 @@ static inline int kleidicv_compare_u8_with_fallback( // clang-format off #undef cv_hal_add8s -#define cv_hal_add8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s8, __VA_ARGS__) +#define cv_hal_add8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s8, __VA_ARGS__) #undef cv_hal_add8u -#define cv_hal_add8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u8, __VA_ARGS__) +#define cv_hal_add8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u8, __VA_ARGS__) #undef cv_hal_add16s -#define cv_hal_add16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s16, __VA_ARGS__) +#define cv_hal_add16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s16, __VA_ARGS__) #undef cv_hal_add16u -#define cv_hal_add16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u16, __VA_ARGS__) +#define cv_hal_add16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u16, __VA_ARGS__) #undef cv_hal_sub8s -#define cv_hal_sub8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s8, __VA_ARGS__) +#define cv_hal_sub8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s8, __VA_ARGS__) #undef cv_hal_sub8u -#define cv_hal_sub8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u8, __VA_ARGS__) +#define cv_hal_sub8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u8, __VA_ARGS__) #undef cv_hal_sub16s -#define cv_hal_sub16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s16, __VA_ARGS__) +#define cv_hal_sub16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s16, __VA_ARGS__) #undef cv_hal_sub16u -#define cv_hal_sub16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u16, __VA_ARGS__) +#define cv_hal_sub16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u16, __VA_ARGS__) #undef cv_hal_absdiff8s -#define cv_hal_absdiff8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s8, __VA_ARGS__) +#define cv_hal_absdiff8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s8, __VA_ARGS__) #undef cv_hal_absdiff8u -#define cv_hal_absdiff8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u8, __VA_ARGS__) +#define cv_hal_absdiff8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u8, __VA_ARGS__) #undef cv_hal_absdiff16s -#define cv_hal_absdiff16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s16, __VA_ARGS__) +#define cv_hal_absdiff16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s16, __VA_ARGS__) #undef cv_hal_absdiff16u -#define cv_hal_absdiff16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u16, __VA_ARGS__) +#define cv_hal_absdiff16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u16, __VA_ARGS__) #undef cv_hal_and8u -#define cv_hal_and8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_bitwise_and, __VA_ARGS__) +#define cv_hal_and8u(...) KLEIDICV_HAL_FORWARD(kleidicv_bitwise_and, __VA_ARGS__) // clang-format on -#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T) \ - static inline int kleidicv_##suffix##_with_fallback( \ - const T *src_a, size_t src_a_stride, const T *src_b, \ - size_t src_b_stride, T *dst, size_t dst_stride, size_t width, \ - size_t height, double scale) { \ - if (scale != 1.0) { \ - return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst, \ - dst_stride, width, height, scale); \ - } \ - return KLEIDICV_THREAD_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride, \ - src_b, src_b_stride, dst, dst_stride, \ - width, height, scale); \ +#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T) \ + static inline int kleidicv_##suffix##_with_fallback( \ + const T *src_a, size_t src_a_stride, const T *src_b, \ + size_t src_b_stride, T *dst, size_t dst_stride, size_t width, \ + size_t height, double scale) { \ + if (scale != 1.0) { \ + return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst, \ + dst_stride, width, height, scale); \ + } \ + return KLEIDICV_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride, src_b, \ + src_b_stride, dst, dst_stride, width, height, \ + scale); \ } -KLEIDICV_HAL_MUL(mul8u, kleidicv_thread_saturating_multiply_u8, uint8_t); +KLEIDICV_HAL_MUL(mul8u, kleidicv_saturating_multiply_u8, uint8_t); #undef cv_hal_mul8u #define cv_hal_mul8u kleidicv_mul8u_with_fallback -KLEIDICV_HAL_MUL(mul8s, kleidicv_thread_saturating_multiply_s8, int8_t); +KLEIDICV_HAL_MUL(mul8s, kleidicv_saturating_multiply_s8, int8_t); #undef cv_hal_mul8s #define cv_hal_mul8s kleidicv_mul8s_with_fallback -KLEIDICV_HAL_MUL(mul16u, kleidicv_thread_saturating_multiply_u16, uint16_t); +KLEIDICV_HAL_MUL(mul16u, kleidicv_saturating_multiply_u16, uint16_t); #undef cv_hal_mul16u #define cv_hal_mul16u kleidicv_mul16u_with_fallback -KLEIDICV_HAL_MUL(mul16s, kleidicv_thread_saturating_multiply_s16, int16_t); +KLEIDICV_HAL_MUL(mul16s, kleidicv_saturating_multiply_s16, int16_t); #undef cv_hal_mul16s #define cv_hal_mul16s kleidicv_mul16s_with_fallback -- GitLab