diff --git a/CHANGELOG.md b/CHANGELOG.md index e014dfea25c8e187f679fbba75bcffd4f339a125..15f628c5db436d1a59b3af59069a6c1776618b0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,8 +36,18 @@ This changelog aims to follow the guiding principles of ### Changed - Filter context creation API specification. - Gaussian Blur API specification. -- In the OpenCV HAL, cvtColor YUV2RGB_NV21 is multithreaded. -- In the OpenCV HAL, minMaxIdx is multithreaded. +- In the OpenCV HAL, the following operations are multithreaded: + * cvtColor + * threshold + * convertTo + * exp + * compare + * add + * sub + * mul + * absdiff + * bitwise_and + * minMaxIdx - Improved performance of Compare Equal and Greater SC API. ### Removed diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 3b3b49c7e6bfdca2cfa720c44ef09a9e11351d53..807c8e324ff5a26b1d663b564bca188e1a1b5f2d 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -68,7 +68,7 @@ static kleidicv_error_t parallel(kleidicv_thread_callback callback, return shared_result; } -static kleidicv_thread_multithreading get_multithreading() { +kleidicv_thread_multithreading get_multithreading() { return kleidicv_thread_multithreading{parallel, nullptr}; } @@ -79,15 +79,17 @@ int gray_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + auto mt = get_multithreading(); + if (depth == CV_8U) { if (dcn == 3) { - return convert_error(kleidicv_gray_to_rgb_u8( + return convert_error(kleidicv_thread_gray_to_rgb_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } - return convert_error(kleidicv_gray_to_rgba_u8( + return convert_error(kleidicv_thread_gray_to_rgba_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -101,27 +103,31 @@ int bgr_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + auto mt = get_multithreading(); + if (depth == CV_8U) { if (scn == 3 && dcn == 3) { if (swapBlue) { - return convert_error(kleidicv_rgb_to_bgr_u8( + return convert_error(kleidicv_thread_rgb_to_bgr_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, + mt)); } - return convert_error(kleidicv_rgb_to_rgb_u8( + return convert_error(kleidicv_thread_rgb_to_rgb_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } if (scn == 4 && dcn == 4) { if (swapBlue) { - return convert_error(kleidicv_rgba_to_bgra_u8( + return convert_error(kleidicv_thread_rgba_to_bgra_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, + mt)); } - return convert_error(kleidicv_rgba_to_rgba_u8( + return convert_error(kleidicv_thread_rgba_to_rgba_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } } @@ -144,34 +150,36 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data, const bool is_bgr = !swapBlue; const bool is_nv21 = (uIdx != 0); + auto mt = get_multithreading(); + if (dcn == 3) { if (is_bgr) { - return convert_error(kleidicv_yuv_sp_to_bgr_u8( + return convert_error(kleidicv_thread_yuv_sp_to_bgr_u8( reinterpret_cast(y_data), y_step, reinterpret_cast(uv_data), uv_step, reinterpret_cast(dst_data), dst_step, dst_width, - dst_height, is_nv21)); + dst_height, is_nv21, mt)); } return convert_error(kleidicv_thread_yuv_sp_to_rgb_u8( reinterpret_cast(y_data), y_step, reinterpret_cast(uv_data), uv_step, reinterpret_cast(dst_data), dst_step, dst_width, dst_height, - is_nv21, get_multithreading())); + is_nv21, mt)); } if (dcn == 4) { if (is_bgr) { - return convert_error(kleidicv_yuv_sp_to_bgra_u8( + return convert_error(kleidicv_thread_yuv_sp_to_bgra_u8( reinterpret_cast(y_data), y_step, reinterpret_cast(uv_data), uv_step, reinterpret_cast(dst_data), dst_step, dst_width, - dst_height, is_nv21)); + dst_height, is_nv21, mt)); } - return convert_error(kleidicv_yuv_sp_to_rgba_u8( + return convert_error(kleidicv_thread_yuv_sp_to_rgba_u8( reinterpret_cast(y_data), y_step, reinterpret_cast(uv_data), uv_step, reinterpret_cast(dst_data), dst_step, dst_width, dst_height, - is_nv21)); + is_nv21, mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -186,16 +194,18 @@ int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + auto mt = get_multithreading(); + if (is_bgr) { - return convert_error(kleidicv_yuv_to_bgr_u8( + return convert_error(kleidicv_thread_yuv_to_bgr_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } - return convert_error(kleidicv_yuv_to_rgb_u8( + return convert_error(kleidicv_thread_yuv_to_rgb_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data, @@ -207,30 +217,32 @@ int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } + auto mt = get_multithreading(); + if (scn == 3) { if (is_bgr) { - return convert_error(kleidicv_bgr_to_yuv_u8( + return convert_error(kleidicv_thread_bgr_to_yuv_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } - return convert_error(kleidicv_rgb_to_yuv_u8( + return convert_error(kleidicv_thread_rgb_to_yuv_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } if (scn == 4) { if (is_bgr) { - return convert_error(kleidicv_bgra_to_yuv_u8( + return convert_error(kleidicv_thread_bgra_to_yuv_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } - return convert_error(kleidicv_rgba_to_yuv_u8( + return convert_error(kleidicv_thread_rgba_to_yuv_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, - static_cast(width), static_cast(height))); + static_cast(width), static_cast(height), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -239,12 +251,15 @@ int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data, int threshold(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) { + auto mt = get_multithreading(); + if ((depth == CV_8U) && (thresholdType == 0 /* THRESH_BINARY */)) { size_t width_in_elements = width * cn; - return convert_error(kleidicv_threshold_binary_u8( + return convert_error(kleidicv_thread_threshold_binary_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width_in_elements, - height, static_cast(thresh), static_cast(maxValue))); + height, static_cast(thresh), static_cast(maxValue), + mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -929,6 +944,8 @@ int min_max_idx(const uchar *src_data, size_t src_step, int width, int height, int convertTo(const uchar *src_data, size_t src_step, int src_depth, uchar *dst_data, size_t dst_step, int dst_depth, int width, int height, double scale, double shift) { + auto mt = get_multithreading(); + // scaling only if (src_depth == dst_depth) { // no scaling, no advantage @@ -939,15 +956,15 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth, switch (src_depth) { case CV_8U: - return convert_error(kleidicv_scale_u8( + return convert_error(kleidicv_thread_scale_u8( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, height, - static_cast(scale), static_cast(shift))); + static_cast(scale), static_cast(shift), mt)); case CV_32F: - return convert_error(kleidicv_scale_f32( + return convert_error(kleidicv_thread_scale_f32( reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, height, - static_cast(scale), static_cast(shift))); + static_cast(scale), static_cast(shift), mt)); default: break; } @@ -957,45 +974,49 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth, if (scale == 1.0 && shift == 0.0) { // float32 to int8 if (src_depth == CV_32F && dst_depth == CV_8S) { - return convert_error(kleidicv_float_conversion_f32_s8( + return convert_error(kleidicv_thread_float_conversion_f32_s8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } // float32 to uint8 if (src_depth == CV_32F && dst_depth == CV_8U) { - return convert_error(kleidicv_float_conversion_f32_u8( + return convert_error(kleidicv_thread_float_conversion_f32_u8( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } // int8 to float32 if (src_depth == CV_8S && dst_depth == CV_32F) { - return convert_error(kleidicv_float_conversion_s8_f32( + return convert_error(kleidicv_thread_float_conversion_s8_f32( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } // uint8 to float32 if (src_depth == CV_8U && dst_depth == CV_32F) { - return convert_error(kleidicv_float_conversion_u8_f32( + return convert_error(kleidicv_thread_float_conversion_u8_f32( reinterpret_cast(src_data), src_step, - reinterpret_cast(dst_data), dst_step, width, height)); + reinterpret_cast(dst_data), dst_step, width, height, mt)); } } return CV_HAL_ERROR_NOT_IMPLEMENTED; } int exp32f(const float *src, float *dst, int len) { - return convert_error(kleidicv_exp_f32(src, len * sizeof(float), dst, - len * sizeof(float), len, 1)); + auto mt = get_multithreading(); + + return convert_error(kleidicv_thread_exp_f32( + src, len * sizeof(float), dst, len * sizeof(float), len, 1, mt)); } int compare_u8(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + auto mt = get_multithreading(); + switch (operation) { case cv::CMP_GT: - return convert_error(kleidicv_compare_greater_u8( + return convert_error(kleidicv_thread_compare_greater_u8( src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, - height)); + height, mt)); default: return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 6cc6af83171172293441c31a572c82d307f15cf8..ea6beaf857fdc6839035c2ce957c25026202fb50 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -9,6 +9,7 @@ #include #include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" #include "opencv2/core/hal/interface.h" // Forward declarations of OpenCV internals. @@ -21,6 +22,8 @@ namespace hal { // Macros to shorten repeated code. #define KLEIDICV_HAL_API(api) (kleidicv::hal::api) +kleidicv_thread_multithreading get_multithreading(); + int gray_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn); @@ -137,9 +140,11 @@ namespace cv { // If the KleidiCV function has a signature matching the OpenCV HAL interface // AND it never returns KLEIDICV_NOT_IMPLEMENTED then we can call it directly // and convert the return code. -#define KLEIDICV_HAL_FORWARD(kleidicv_impl, ...) \ - (kleidicv_impl(__VA_ARGS__) == KLEIDICV_OK ? CV_HAL_ERROR_OK \ - : CV_HAL_ERROR_UNKNOWN) +#define KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_impl, ...) \ + (kleidicv_thread_impl(__VA_ARGS__, kleidicv::hal::get_multithreading()) == \ + KLEIDICV_OK \ + ? CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_UNKNOWN) #define KLEIDICV_HAL_FALLBACK_FORWARD(kleidicv_impl, fallback_hal_impl, ...) \ (KLEIDICV_HAL_API(kleidicv_impl)(__VA_ARGS__) == CV_HAL_ERROR_OK \ @@ -436,63 +441,63 @@ static inline int kleidicv_compare_u8_with_fallback( // clang-format off #undef cv_hal_add8s -#define cv_hal_add8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s8, __VA_ARGS__) +#define cv_hal_add8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s8, __VA_ARGS__) #undef cv_hal_add8u -#define cv_hal_add8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u8, __VA_ARGS__) +#define cv_hal_add8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u8, __VA_ARGS__) #undef cv_hal_add16s -#define cv_hal_add16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s16, __VA_ARGS__) +#define cv_hal_add16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s16, __VA_ARGS__) #undef cv_hal_add16u -#define cv_hal_add16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u16, __VA_ARGS__) +#define cv_hal_add16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u16, __VA_ARGS__) #undef cv_hal_sub8s -#define cv_hal_sub8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s8, __VA_ARGS__) +#define cv_hal_sub8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s8, __VA_ARGS__) #undef cv_hal_sub8u -#define cv_hal_sub8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u8, __VA_ARGS__) +#define cv_hal_sub8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u8, __VA_ARGS__) #undef cv_hal_sub16s -#define cv_hal_sub16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s16, __VA_ARGS__) +#define cv_hal_sub16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s16, __VA_ARGS__) #undef cv_hal_sub16u -#define cv_hal_sub16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u16, __VA_ARGS__) +#define cv_hal_sub16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u16, __VA_ARGS__) #undef cv_hal_absdiff8s -#define cv_hal_absdiff8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s8, __VA_ARGS__) +#define cv_hal_absdiff8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s8, __VA_ARGS__) #undef cv_hal_absdiff8u -#define cv_hal_absdiff8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u8, __VA_ARGS__) +#define cv_hal_absdiff8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u8, __VA_ARGS__) #undef cv_hal_absdiff16s -#define cv_hal_absdiff16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s16, __VA_ARGS__) +#define cv_hal_absdiff16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s16, __VA_ARGS__) #undef cv_hal_absdiff16u -#define cv_hal_absdiff16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u16, __VA_ARGS__) +#define cv_hal_absdiff16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u16, __VA_ARGS__) #undef cv_hal_and8u -#define cv_hal_and8u(...) KLEIDICV_HAL_FORWARD(kleidicv_bitwise_and, __VA_ARGS__) +#define cv_hal_and8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_bitwise_and, __VA_ARGS__) // clang-format on -#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T) \ - static inline int kleidicv_##suffix##_with_fallback( \ - const T *src_a, size_t src_a_stride, const T *src_b, \ - size_t src_b_stride, T *dst, size_t dst_stride, size_t width, \ - size_t height, double scale) { \ - if (scale != 1.0) { \ - return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst, \ - dst_stride, width, height, scale); \ - } \ - return KLEIDICV_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride, src_b, \ - src_b_stride, dst, dst_stride, width, height, \ - scale); \ +#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T) \ + static inline int kleidicv_##suffix##_with_fallback( \ + const T *src_a, size_t src_a_stride, const T *src_b, \ + size_t src_b_stride, T *dst, size_t dst_stride, size_t width, \ + size_t height, double scale) { \ + if (scale != 1.0) { \ + return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst, \ + dst_stride, width, height, scale); \ + } \ + return KLEIDICV_THREAD_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride, \ + src_b, src_b_stride, dst, dst_stride, \ + width, height, scale); \ } -KLEIDICV_HAL_MUL(mul8u, kleidicv_saturating_multiply_u8, uint8_t); +KLEIDICV_HAL_MUL(mul8u, kleidicv_thread_saturating_multiply_u8, uint8_t); #undef cv_hal_mul8u #define cv_hal_mul8u kleidicv_mul8u_with_fallback -KLEIDICV_HAL_MUL(mul8s, kleidicv_saturating_multiply_s8, int8_t); +KLEIDICV_HAL_MUL(mul8s, kleidicv_thread_saturating_multiply_s8, int8_t); #undef cv_hal_mul8s #define cv_hal_mul8s kleidicv_mul8s_with_fallback -KLEIDICV_HAL_MUL(mul16u, kleidicv_saturating_multiply_u16, uint16_t); +KLEIDICV_HAL_MUL(mul16u, kleidicv_thread_saturating_multiply_u16, uint16_t); #undef cv_hal_mul16u #define cv_hal_mul16u kleidicv_mul16u_with_fallback -KLEIDICV_HAL_MUL(mul16s, kleidicv_saturating_multiply_s16, int16_t); +KLEIDICV_HAL_MUL(mul16s, kleidicv_thread_saturating_multiply_s16, int16_t); #undef cv_hal_mul16s #define cv_hal_mul16s kleidicv_mul16s_with_fallback diff --git a/doc/opencv.md b/doc/opencv.md index 0d0b3de03ca7e7c10f579bbacbc67b2f8cac371d..4d840bf46428ce17fde1615b89f0dbc849fdae81 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -14,9 +14,19 @@ For details of building OpenCV with KleidiCV see [the build documentation](build For single-threaded use cases, enabling the KleidiCV OpenCV HAL is likely to provide a performance boost for the functions that it implements. -Multithreading is planned for KleidiCV but at present it is single-threaded -only. Therefore it is recommended to check the performance yourself in order -to decide whether to enable KleidiCV in a multicore environment. +The same is true for multi-threaded use cases, with the exception of the +following functions: +* dilate +* erode +* GaussianBlur +* resize +* sepFilter2D +* Sobel +* transpose +Multithreading is planned for these functions in KleidiCV but at present they +are single-threaded only. Therefore it is recommended to check the performance +yourself in order to decide whether to enable KleidiCV in a multicore +environment. ## Functionality in KleidiCV OpenCV HAL diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 5a1f0f863d92cda55912e04524f898c1a34cc648..ae0c04e0b3606a511275ff681d81cedf37fb17ac 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -46,15 +46,61 @@ typedef struct { void *parallel_data; } kleidicv_thread_multithreading; +#define KLEIDICV_THREAD_UNARY_OP(name, src_type, dst_type) \ + kleidicv_error_t name(const src_type *src, size_t src_stride, dst_type *dst, \ + size_t dst_stride, size_t width, size_t height, \ + kleidicv_thread_multithreading) + +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_gray_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_gray_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_bgra_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_bgra_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_yuv_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_yuv_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_bgr_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_bgra_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_exp_f32, float, float); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_f32_s8, float, + int8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_f32_u8, float, + uint8_t); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_s8_f32, int8_t, + float); +KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_u8_f32, uint8_t, + float); + /// Internal - not part of the public API and its direct use is not supported. /// -/// Multithreaded implementation of kleidicv_yuv_sp_to_rgb_u8 - see the +/// Multithreaded implementation of kleidicv_yuv_sp_to_bgr_u8 - see the /// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_sp_to_bgr_u8( + const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, + size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, + size_t height, bool is_nv21, kleidicv_thread_multithreading); + +kleidicv_error_t kleidicv_thread_yuv_sp_to_bgra_u8( + const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, + size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, + size_t height, bool is_nv21, kleidicv_thread_multithreading); + kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +kleidicv_error_t kleidicv_thread_yuv_sp_to_rgba_u8( + const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, + size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, + size_t height, bool is_nv21, kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_min_max_u8 - see the @@ -123,6 +169,75 @@ kleidicv_error_t kleidicv_thread_min_max_loc_u8( const uint8_t *src, size_t src_stride, size_t width, size_t height, size_t *min_offset, size_t *max_offset, kleidicv_thread_multithreading); +kleidicv_error_t kleidicv_thread_threshold_binary_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, uint8_t threshold, uint8_t value, + kleidicv_thread_multithreading); + +kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + float scale, float shift, + kleidicv_thread_multithreading); + +kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, + float *dst, size_t dst_stride, + size_t width, size_t height, + float scale, float shift, + kleidicv_thread_multithreading); + +#define KLEIDICV_THREAD_BINARY_OP(name, type) \ + kleidicv_error_t name(const type *src_a, size_t src_a_stride, \ + const type *src_b, size_t src_b_stride, type *dst, \ + size_t dst_stride, size_t width, size_t height, \ + kleidicv_thread_multithreading) + +#define KLEIDICV_THREAD_BINARY_OP_SCALE(name, type, scaletype) \ + kleidicv_error_t name(const type *src_a, size_t src_a_stride, \ + const type *src_b, size_t src_b_stride, type *dst, \ + size_t dst_stride, size_t width, size_t height, \ + scaletype scale, kleidicv_thread_multithreading) + +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u32, uint32_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s64, int64_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u64, uint64_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u32, uint32_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s64, int64_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u64, uint64_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_u8, uint8_t, + double); +KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s8, int8_t, + double); +KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_u16, + uint16_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s16, + int16_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s32, + int32_t, double); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_bitwise_and, uint8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_equal_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_greater_u8, uint8_t); + +kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( + const int16_t *src_a, size_t src_a_stride, const int16_t *src_b, + size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width, + size_t height, int16_t threshold, kleidicv_thread_multithreading); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 96de2d333eb1bdd606027bdc223b7d13e123725f..70107a5091bc699f2848712eae08c41df3ee0947 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -5,50 +5,201 @@ #include "kleidicv_thread/kleidicv_thread.h" #include +#include #include #include #include "kleidicv/kleidicv.h" -struct kleidicv_thread_yuv_sp_to_rgb_u8_data { - const uint8_t *src_y; - size_t src_y_stride; - const uint8_t *src_uv; - size_t src_uv_stride; - uint8_t *dst; - size_t dst_stride; - size_t width; - size_t height; - bool is_nv21; -}; +typedef std::function FunctionCallback; -static kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_callback( - unsigned task_begin, unsigned task_end, void *void_data) { - auto *data = - reinterpret_cast(void_data); +static kleidicv_error_t kleidicv_thread_std_function_callback( + unsigned task_begin, unsigned task_end, void *data) { + auto *callback = reinterpret_cast(data); + return (*callback)(task_begin, task_end); +} + +template +inline kleidicv_error_t kleidicv_thread_unary_op_impl( + F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride, + DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) { + FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) { + return f(src + task_begin * src_stride / sizeof(SrcT), src_stride, + dst + task_begin * dst_stride / sizeof(DstT), dst_stride, width, + task_end - task_begin, args...); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, + mt.parallel_data, height); +} - size_t row_begin = size_t{task_begin} * 2; - size_t row_end = std::min(data->height, size_t{task_end} * 2); - size_t row_uv = task_begin; +template +inline kleidicv_error_t kleidicv_thread_binary_op_impl( + F f, kleidicv_thread_multithreading mt, const SrcT *src_a, + size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst, + size_t dst_stride, size_t width, size_t height, Args... args) { + FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) { + return f(src_a + task_begin * src_a_stride / sizeof(SrcT), src_a_stride, + src_b + task_begin * src_b_stride / sizeof(SrcT), src_b_stride, + dst + task_begin * dst_stride / sizeof(DstT), dst_stride, width, + task_end - task_begin, args...); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, + mt.parallel_data, height); +} + +#define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type) \ + kleidicv_error_t kleidicv_thread_##suffix( \ + const src_type *src, size_t src_stride, dst_type *dst, \ + size_t dst_stride, size_t width, size_t height, \ + kleidicv_thread_multithreading mt) { \ + return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \ + src_stride, dst, dst_stride, width, \ + height); \ + } + +KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgr_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgb_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(bgr_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(bgra_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_yuv_u8, uint8_t, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float); +KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_f32_s8, float, int8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_f32_u8, float, uint8_t); +KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_s8_f32, int8_t, float); +KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_u8_f32, uint8_t, float); + +kleidicv_error_t kleidicv_thread_threshold_binary_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, uint8_t threshold, uint8_t value, + kleidicv_thread_multithreading mt) { + return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src, + src_stride, dst, dst_stride, width, + height, threshold, value); +} - return kleidicv_yuv_sp_to_rgb_u8( - data->src_y + row_begin * data->src_y_stride, data->src_y_stride, - data->src_uv + row_uv * data->src_uv_stride, data->src_uv_stride, - data->dst + row_begin * data->dst_stride, data->dst_stride, data->width, - row_end - row_begin, data->is_nv21); +kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + float scale, float shift, + kleidicv_thread_multithreading mt) { + return kleidicv_thread_unary_op_impl(kleidicv_scale_u8, mt, src, src_stride, + dst, dst_stride, width, height, scale, + shift); } -kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8( - const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, +kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, + float *dst, size_t dst_stride, + size_t width, size_t height, + float scale, float shift, + kleidicv_thread_multithreading mt) { + return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride, + dst, dst_stride, width, height, scale, + shift); +} + +#define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type) \ + kleidicv_error_t kleidicv_thread_##suffix( \ + const type *src_a, size_t src_a_stride, const type *src_b, \ + size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \ + size_t height, kleidicv_thread_multithreading mt) { \ + return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a, \ + src_a_stride, src_b, src_b_stride, \ + dst, dst_stride, width, height); \ + } + +#define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \ + kleidicv_error_t kleidicv_thread_##suffix( \ + const type *src_a, size_t src_a_stride, const type *src_b, \ + size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \ + size_t height, scaletype scale, kleidicv_thread_multithreading mt) { \ + return kleidicv_thread_binary_op_impl( \ + kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \ + dst_stride, width, height, scale); \ + } + +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t); +KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double); +KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double); +KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t); +KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t); + +kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( + const int16_t *src_a, size_t src_a_stride, const int16_t *src_b, + size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width, + size_t height, int16_t threshold, kleidicv_thread_multithreading mt) { + return kleidicv_thread_binary_op_impl( + kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride, + src_b, src_b_stride, dst, dst_stride, width, height, threshold); +} + +template +inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( + F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { - kleidicv_thread_yuv_sp_to_rgb_u8_data callback_data = { - src_y, src_y_stride, src_uv, src_uv_stride, dst, - dst_stride, width, height, is_nv21}; - return mt.parallel(kleidicv_thread_yuv_sp_to_rgb_u8_callback, &callback_data, + FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) { + size_t row_begin = size_t{task_begin} * 2; + size_t row_end = std::min(height, size_t{task_end} * 2); + size_t row_uv = task_begin; + return f(src_y + row_begin * src_y_stride, src_y_stride, + src_uv + row_uv * src_uv_stride, src_uv_stride, + dst + row_begin * dst_stride, dst_stride, width, + row_end - row_begin, is_nv21); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, mt.parallel_data, (height + 1) / 2); } +#define YUV_SP_TO_RGB(suffix) \ + kleidicv_error_t kleidicv_thread_##suffix( \ + const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, \ + size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, \ + size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { \ + return kleidicv_thread_yuv_sp_to_rgb_u8_impl( \ + kleidicv_##suffix, src_y, src_y_stride, src_uv, src_uv_stride, dst, \ + dst_stride, width, height, is_nv21, mt); \ + } + +YUV_SP_TO_RGB(yuv_sp_to_bgr_u8); +YUV_SP_TO_RGB(yuv_sp_to_bgra_u8); +YUV_SP_TO_RGB(yuv_sp_to_rgb_u8); +YUV_SP_TO_RGB(yuv_sp_to_rgba_u8); + template struct parallel_min_max_data { FunctionType min_max_func; diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69d5534d4019247c0ddcd8d9d345bb1a83359d18 --- /dev/null +++ b/test/api/test_thread.cpp @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class Thread : public testing::TestWithParam

{ + public: + template + void check_unary_op(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t src_channels, + size_t dst_channels, Args... args) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src(size_t{width} * src_channels, height); + test::Array2D dst_single(size_t{width} * dst_channels, height), + dst_multi(size_t{width} * dst_channels, height); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = + single_threaded_func(src.data(), src.stride(), dst_single.data(), + dst_single.stride(), width, height, args...); + + kleidicv_error_t multi_result = multithreaded_func( + src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width, + height, args..., get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } + + template + void check_binary_op(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, + size_t src_channels, size_t dst_channels, Args... args) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src_a(size_t{width} * src_channels, height), + src_b(size_t{width} * src_channels, height); + test::Array2D dst_single(size_t{width} * dst_channels, height), + dst_multi(size_t{width} * dst_channels, height); + + test::PseudoRandomNumberGenerator generator; + src_a.fill(generator); + src_b.fill(generator); + + kleidicv_error_t single_result = single_threaded_func( + src_a.data(), src_a.stride(), src_b.data(), src_b.stride(), + dst_single.data(), dst_single.stride(), width, height, args...); + + kleidicv_error_t multi_result = multithreaded_func( + src_a.data(), src_a.stride(), src_b.data(), src_b.stride(), + dst_multi.data(), dst_multi.stride(), width, height, args..., + get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } +}; + +#define TEST_UNARY_OP(suffix, SrcT, DstT, ...) \ + TEST_P(Thread, suffix) { \ + check_unary_op(kleidicv_##suffix, kleidicv_thread_##suffix, \ + __VA_ARGS__); \ + } + +#define TEST_BINARY_OP(suffix, T, ...) \ + TEST_P(Thread, suffix) { \ + check_binary_op(kleidicv_##suffix, kleidicv_thread_##suffix, \ + __VA_ARGS__); \ + } + +TEST_UNARY_OP(gray_to_rgb_u8, uint8_t, uint8_t, 1, 3); +TEST_UNARY_OP(gray_to_rgba_u8, uint8_t, uint8_t, 1, 4); +TEST_UNARY_OP(rgb_to_bgr_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(rgb_to_rgb_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(rgba_to_bgra_u8, uint8_t, uint8_t, 4, 4); +TEST_UNARY_OP(rgba_to_rgba_u8, uint8_t, uint8_t, 4, 4); +TEST_UNARY_OP(rgb_to_bgra_u8, uint8_t, uint8_t, 3, 4); +TEST_UNARY_OP(rgb_to_rgba_u8, uint8_t, uint8_t, 3, 4); +TEST_UNARY_OP(rgba_to_bgr_u8, uint8_t, uint8_t, 4, 3); +TEST_UNARY_OP(rgba_to_rgb_u8, uint8_t, uint8_t, 4, 3); +TEST_UNARY_OP(yuv_to_bgr_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(yuv_to_rgb_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(bgr_to_yuv_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(rgb_to_yuv_u8, uint8_t, uint8_t, 3, 3); +TEST_UNARY_OP(bgra_to_yuv_u8, uint8_t, uint8_t, 4, 3); +TEST_UNARY_OP(rgba_to_yuv_u8, uint8_t, uint8_t, 4, 3); +TEST_UNARY_OP(threshold_binary_u8, uint8_t, uint8_t, 1, 1, 100, 200); +TEST_UNARY_OP(scale_u8, uint8_t, uint8_t, 1, 1, 0.5F, 3.5F); +TEST_UNARY_OP(scale_f32, float, float, 1, 1, 0.123F, 45.6789F); +TEST_UNARY_OP(exp_f32, float, float, 1, 1); +TEST_UNARY_OP(float_conversion_f32_s8, float, int8_t, 1, 1); +TEST_UNARY_OP(float_conversion_f32_u8, float, uint8_t, 1, 1); +TEST_UNARY_OP(float_conversion_s8_f32, int8_t, float, 1, 1); +TEST_UNARY_OP(float_conversion_u8_f32, uint8_t, float, 1, 1); + +TEST_BINARY_OP(saturating_add_s8, int8_t, 1, 1); +TEST_BINARY_OP(saturating_add_u8, uint8_t, 1, 1); +TEST_BINARY_OP(saturating_add_s16, int16_t, 1, 1); +TEST_BINARY_OP(saturating_add_u16, uint16_t, 1, 1); +TEST_BINARY_OP(saturating_add_s32, int32_t, 1, 1); +TEST_BINARY_OP(saturating_add_u32, uint32_t, 1, 1); +TEST_BINARY_OP(saturating_add_s64, int64_t, 1, 1); +TEST_BINARY_OP(saturating_add_u64, uint64_t, 1, 1); +TEST_BINARY_OP(saturating_sub_s8, int8_t, 1, 1); +TEST_BINARY_OP(saturating_sub_u8, uint8_t, 1, 1); +TEST_BINARY_OP(saturating_sub_s16, int16_t, 1, 1); +TEST_BINARY_OP(saturating_sub_u16, uint16_t, 1, 1); +TEST_BINARY_OP(saturating_sub_s32, int32_t, 1, 1); +TEST_BINARY_OP(saturating_sub_u32, uint32_t, 1, 1); +TEST_BINARY_OP(saturating_sub_s64, int64_t, 1, 1); +TEST_BINARY_OP(saturating_sub_u64, uint64_t, 1, 1); +TEST_BINARY_OP(saturating_absdiff_u8, uint8_t, 1, 1); +TEST_BINARY_OP(saturating_absdiff_s8, int8_t, 1, 1); +TEST_BINARY_OP(saturating_absdiff_u16, uint16_t, 1, 1); +TEST_BINARY_OP(saturating_absdiff_s16, int16_t, 1, 1); +TEST_BINARY_OP(saturating_absdiff_s32, int32_t, 1, 1); +TEST_BINARY_OP(saturating_multiply_u8, uint8_t, 1, 1, 1.23); +TEST_BINARY_OP(saturating_multiply_s8, int8_t, 1, 1, -2.34); +TEST_BINARY_OP(saturating_multiply_u16, uint16_t, 1, 1, 0.321); +TEST_BINARY_OP(saturating_multiply_s16, int16_t, 1, 1, -0.543); +TEST_BINARY_OP(saturating_multiply_s32, int32_t, 1, 1, -0.0123); +TEST_BINARY_OP(bitwise_and, uint8_t, 1, 1); +TEST_BINARY_OP(compare_equal_u8, uint8_t, 1, 1); +TEST_BINARY_OP(compare_greater_u8, uint8_t, 1, 1); +TEST_BINARY_OP(saturating_add_abs_with_threshold_s16, int16_t, 1, 1, 123); + +INSTANTIATE_TEST_SUITE_P(, Thread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5})); diff --git a/test/api/test_thread_yuv_sp_to_rgb.cpp b/test/api/test_thread_yuv_sp_to_rgb.cpp index 1f3731df25f28cb4083341215c7f397fe513181a..a358edbd6b53797ab6419549d71b6d8c9d8bf7ed 100644 --- a/test/api/test_thread_yuv_sp_to_rgb.cpp +++ b/test/api/test_thread_yuv_sp_to_rgb.cpp @@ -16,35 +16,51 @@ // Tuple of width, height, thread count. typedef std::tuple P; -class Thread : public testing::TestWithParam

{}; - -TEST_P(Thread, _) { - unsigned width = 0, height = 0, thread_count = 0; - std::tie(width, height, thread_count) = GetParam(); - test::Array2D src_y(width, height), - src_uv((width + 1) & ~1, (height + 1) / 2), - dst_single(size_t{width} * 3, height), - dst_multi(size_t{width} * 3, height); - - test::PseudoRandomNumberGenerator generator; - src_y.fill(generator); - src_uv.fill(generator); - - kleidicv_error_t single_result = kleidicv_yuv_sp_to_rgb_u8( - src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_single.data(), dst_single.stride(), width, height, false); - - kleidicv_error_t multi_result = kleidicv_thread_yuv_sp_to_rgb_u8( - src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_multi.data(), dst_multi.stride(), width, height, false, - get_multithreading_fake(thread_count)); - - EXPECT_EQ(KLEIDICV_OK, single_result); - EXPECT_EQ(KLEIDICV_OK, multi_result); - EXPECT_EQ_ARRAY2D(dst_multi, dst_single); +class YuvSpThread : public testing::TestWithParam

{ + public: + template + void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t channels) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src_y(width, height), + src_uv((width + 1) & ~1, (height + 1) / 2), + dst_single(size_t{width} * channels, height), + dst_multi(size_t{width} * channels, height); + + test::PseudoRandomNumberGenerator generator; + src_y.fill(generator); + src_uv.fill(generator); + + kleidicv_error_t single_result = single_threaded_func( + src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst_single.data(), dst_single.stride(), width, height, false); + + kleidicv_error_t multi_result = multithreaded_func( + src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst_multi.data(), dst_multi.stride(), width, height, false, + get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } +}; + +TEST_P(YuvSpThread, ToBGR) { + check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3); +} +TEST_P(YuvSpThread, ToBGRA) { + check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4); +} +TEST_P(YuvSpThread, ToRGB) { + check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3); +} +TEST_P(YuvSpThread, ToRGBA) { + check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4); } -INSTANTIATE_TEST_SUITE_P(YuvSp, Thread, +INSTANTIATE_TEST_SUITE_P(, YuvSpThread, testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2},