diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 7fa9209f04bc6d085ac895ecffb3f650ec44adc7..9bd743fe153a7c50bc2e255bf7170a059f2925e8 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -281,6 +281,84 @@ int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } +int bgr_to_yuv420p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int width, int height, int scn, + bool swapBlue, int uIdx) { + const bool is_bgr = !swapBlue; + const bool is_nv21 = (uIdx != 1); + auto mt = get_multithreading(); + if (scn == 3) { + if (is_bgr) { + return convert_error(kleidicv_thread_bgr_to_yuv420p_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, + static_cast(width), static_cast(height), is_nv21, + mt)); + } + return convert_error(kleidicv_thread_rgb_to_yuv420p_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, + static_cast(width), static_cast(height), is_nv21, mt)); + } + + if (scn == 4) { + if (is_bgr) { + return convert_error(kleidicv_thread_bgra_to_yuv420p_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, + static_cast(width), static_cast(height), is_nv21, + mt)); + } + return convert_error(kleidicv_thread_rgba_to_yuv420p_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, + static_cast(width), static_cast(height), is_nv21, mt)); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +int bgr_to_yuv420sp(const uchar *src_data, size_t src_step, uchar *y_data, + size_t y_step, uchar *uv_data, size_t uv_step, int width, + int height, int scn, bool swapBlue, int uIdx) { + const bool is_bgr = !swapBlue; + const bool is_nv21 = (uIdx != 1); + auto mt = get_multithreading(); + if (scn == 3) { + if (is_bgr) { + return convert_error(kleidicv_thread_bgr_to_yuv420sp_u8( + reinterpret_cast(src_data), src_step, + 
reinterpret_cast(y_data), y_step, + reinterpret_cast(uv_data), uv_step, + static_cast(width), static_cast(height), is_nv21, + mt)); + } + return convert_error(kleidicv_thread_rgb_to_yuv420sp_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(y_data), y_step, + reinterpret_cast(uv_data), uv_step, + static_cast(width), static_cast(height), is_nv21, mt)); + } + + if (scn == 4) { + if (is_bgr) { + return convert_error(kleidicv_thread_bgra_to_yuv420sp_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(y_data), y_step, + reinterpret_cast(uv_data), uv_step, + static_cast(width), static_cast(height), is_nv21, + mt)); + } + return convert_error(kleidicv_thread_rgba_to_yuv420sp_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(y_data), y_step, + reinterpret_cast(uv_data), uv_step, + static_cast(width), static_cast(height), is_nv21, mt)); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 0c9cd5e6bab0587d616192cd22c9280df1d726ed..204260044e91f95ae26f33f8be395551402afb0a 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -29,6 +29,14 @@ int bgr_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +int bgr_to_yuv420p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int width, int height, int scn, + bool swapBlue, int uIdx); + +int bgr_to_yuv420sp(const uchar *src_data, size_t src_step, uchar *y_data, + size_t y_step, uchar *uv_data, size_t uv_step, int width, + int height, int scn, bool swapBlue, int uIdx); + int yuv_to_bgr_sp(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool 
swapBlue, int uIdx); @@ -215,6 +223,29 @@ static inline int kleidicv_bgr_to_bgr_with_fallback( #undef cv_hal_cvtBGRtoBGR #define cv_hal_cvtBGRtoBGR kleidicv_bgr_to_bgr_with_fallback +static inline int kleidicv_bgr_to_yuv420_with_fallback( + const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, + int width, int height, int scn, bool swapBlue, int uIdx) { + return KLEIDICV_HAL_FALLBACK_FORWARD( + bgr_to_yuv420p, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, + dst_data, dst_step, width, height, scn, swapBlue, uIdx); +} + +#undef cv_hal_cvtBGRtoThreePlaneYUV +#define cv_hal_cvtBGRtoThreePlaneYUV kleidicv_bgr_to_yuv420_with_fallback + +static inline int kleidicv_bgr_to_yuv420sp_with_fallback( + const uchar *src_data, size_t src_step, uchar *y_data, size_t y_step, + uchar *uv_data, size_t uv_step, int width, int height, int scn, + bool swapBlue, int uIdx) { + return KLEIDICV_HAL_FALLBACK_FORWARD( + bgr_to_yuv420sp, cv_hal_cvtBGRtoTwoPlaneYUV, src_data, src_step, y_data, + y_step, uv_data, uv_step, width, height, scn, swapBlue, uIdx); +} + +#undef cv_hal_cvtBGRtoTwoPlaneYUV +#define cv_hal_cvtBGRtoTwoPlaneYUV kleidicv_bgr_to_yuv420sp_with_fallback + // yuv_to_bgr_sp static inline int kleidicv_yuv_to_bgr_sp_with_fallback( const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 218fb8902e8c118b2b6e146f40946cbc144dd772..8f814faac5f673765046337e815dc50a3d9e5bf5 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -718,7 +718,8 @@ static void yuv_sp(Function f, benchmark::State& state) { get_source_buffer_b(), (image_width / 2) * sizeof(uint8_t), get_destination_buffer_a(), - image_width * sizeof(uint8_t), image_width, image_height, true); + image_width * sizeof(uint8_t) * OutChannels, image_width, + image_height, true); }); } @@ -742,6 +743,68 @@ static void yuv_sp_to_bgra(benchmark::State& state) { } BENCHMARK(yuv_sp_to_bgra); +template +static void 
rgb_to_yuv_imp(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f(get_source_buffer_a(), + InChannels * image_width * sizeof(uint8_t), + get_destination_buffer_a(), + image_width * sizeof(uint8_t), image_width, image_height, true); + }); +} + +static void rgb_to_yuv_p(benchmark::State& state) { + rgb_to_yuv_imp<3>(kleidicv_rgb_to_yuv420p_u8, state); +} +BENCHMARK(rgb_to_yuv_p); + +static void rgba_to_yuv_p(benchmark::State& state) { + rgb_to_yuv_imp<4>(kleidicv_rgba_to_yuv420p_u8, state); +} +BENCHMARK(rgba_to_yuv_p); + +static void bgr_to_yuv_p(benchmark::State& state) { + rgb_to_yuv_imp<3>(kleidicv_bgr_to_yuv420p_u8, state); +} +BENCHMARK(bgr_to_yuv_p); + +static void bgra_to_yuv_p(benchmark::State& state) { + rgb_to_yuv_imp<4>(kleidicv_bgra_to_yuv420p_u8, state); +} +BENCHMARK(bgra_to_yuv_p); + +template +static void rgb_to_yuv_sp_imp(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f( + get_source_buffer_a(), + InChannels * image_width * sizeof(uint8_t), + get_destination_buffer_a(), image_width * sizeof(uint8_t), + get_destination_buffer_b(), + (image_width / 2) * sizeof(uint8_t), image_width, image_height, true); + }); +} + +static void rgb_to_yuv_sp(benchmark::State& state) { + rgb_to_yuv_sp_imp<3>(kleidicv_rgb_to_yuv420sp_u8, state); +} +BENCHMARK(rgb_to_yuv_sp); + +static void rgba_to_yuv_sp(benchmark::State& state) { + rgb_to_yuv_sp_imp<4>(kleidicv_rgba_to_yuv420sp_u8, state); +} +BENCHMARK(rgba_to_yuv_sp); + +static void bgr_to_yuv_sp(benchmark::State& state) { + rgb_to_yuv_sp_imp<3>(kleidicv_bgr_to_yuv420sp_u8, state); +} +BENCHMARK(bgr_to_yuv_sp); + +static void bgra_to_yuv_sp(benchmark::State& state) { + rgb_to_yuv_sp_imp<4>(kleidicv_bgra_to_yuv420sp_u8, state); +} +BENCHMARK(bgra_to_yuv_sp); + template static void morphology(Function f, benchmark::State& state) { kleidicv_morphology_context_t* context = nullptr; diff --git a/conformity/opencv/test_rgb2yuv420p.cpp 
b/conformity/opencv/test_rgb2yuv420p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f6a9e7dc7043821953ccd17caf0f7c18fa577b04 --- /dev/null +++ b/conformity/opencv/test_rgb2yuv420p.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tests.h" + +template +static cv::Mat exec_cvtcolor(cv::Mat& input) { + cv::Mat result; + cv::cvtColor(input, result, Code); + return result; +} + +#if MANAGER +template +bool test_rgb2yuv420p(int index, RecreatedMessageQueue& request_queue, + RecreatedMessageQueue& reply_queue) { + cv::RNG rng(0); + + auto check = [&](size_t x, size_t y) -> bool { + cv::Mat input(x, y, CV_8UC(channels)); + rng.fill(input, cv::RNG::UNIFORM, 0, 255); + + cv::Mat actual = exec_cvtcolor(input); + cv::Mat expected = + get_expected_from_subordinate(index, request_queue, reply_queue, input); + + if (are_matrices_different(0, actual, expected)) { + fail_print_matrices(x, y, input, actual, expected); + return true; + } + return false; + }; + + // OpenCV only accepts two-plane images with an even number of columns & rows. + for (size_t x = 4; x <= 32; x += 2) { + for (size_t y = 2; y <= 32; y += 2) { + if (check(x, y)) { + return true; + } + } + } + + // Check taller images - this number of rows was necessary to trigger a bug on + // a machine with 64 cores. 
+ if (check(36, 1000)) { + return true; + } + + return false; +} +#endif + +#define CVTCOLOR_TEST(code, channel) \ + TEST(#code, (test_rgb2yuv420p), \ + exec_cvtcolor) + +std::vector& rgb2yuv420_tests_get() { + // clang-format off + static std::vector tests = { + CVTCOLOR_TEST(RGB2YUV_YV12 , 3), + CVTCOLOR_TEST(RGBA2YUV_YV12, 4), + CVTCOLOR_TEST(BGR2YUV_YV12 , 3), + CVTCOLOR_TEST(BGRA2YUV_YV12, 4), + CVTCOLOR_TEST(BGR2YUV_IYUV , 3), + CVTCOLOR_TEST(BGRA2YUV_IYUV, 4), + CVTCOLOR_TEST(RGB2YUV_IYUV , 3), + CVTCOLOR_TEST(RGBA2YUV_IYUV, 4), + }; + // clang-format on + return tests; +} diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp index 55744af48793171c5a0792a859538cf6c354a101..1d4df10ee43f9545fe18f29344e59453ad7bb786 100644 --- a/conformity/opencv/tests.cpp +++ b/conformity/opencv/tests.cpp @@ -45,6 +45,7 @@ std::vector all_tests = merge_tests({ blur_and_downsample_tests_get, scharr_interleaved_tests_get, median_blur_tests_get, + rgb2yuv420_tests_get, }); #if MANAGER diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h index b7d4238b8e2965a6effae11e8c35cf1338a4da1e..70606db0c04ecf23c46396c250108a4352958da2 100644 --- a/conformity/opencv/tests.h +++ b/conformity/opencv/tests.h @@ -29,5 +29,6 @@ std::vector& warp_perspective_tests_get(); std::vector& blur_and_downsample_tests_get(); std::vector& scharr_interleaved_tests_get(); std::vector& median_blur_tests_get(); +std::vector& rgb2yuv420_tests_get(); #endif // KLEIDICV_OPENCV_CONFORMITY_TESTS_H_ diff --git a/doc/functionality.md b/doc/functionality.md index 43ae983c8825dc80c334ebe3b90bf82ab10454fe..7774d34f4bf00f83b8ddcc5b002eb3e3043b274e 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -29,26 +29,34 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. 
| Bitwise And | x | ## Color conversions -| | u8 | -|--------------|-----| -| Gray-RGB | x | -| Gray-RGBA | x | -| RGB-BGR | x | -| BGR-RGB | x | -| RGBA-BGRA | x | -| BGRA-RGBA | x | -| YUV420-BGR | x | -| YUV420-BGRA | x | -| YUV420-RGB | x | -| YUV420-RGBA | x | -| YUV-BGR | x | -| YUV-RGB | x | -| YUV-BGRA | x | -| YUV-RGBA | x | -| RGB-YUV | x | -| RGBA-YUV | x | -| BGR-YUV | x | -| BGRA-YUV | x | +| | u8 | +|-------------------|-----| +| Gray-RGB | x | +| Gray-RGBA | x | +| RGB-BGR | x | +| BGR-RGB | x | +| RGBA-BGRA | x | +| BGRA-RGBA | x | +| YUV420-BGR | x | +| YUV420-BGRA | x | +| YUV420-RGB | x | +| YUV420-RGBA | x | +| YUV-BGR | x | +| YUV-RGB | x | +| YUV-BGRA | x | +| YUV-RGBA | x | +| RGB-YUV | x | +| RGBA-YUV | x | +| BGR-YUV | x | +| BGRA-YUV | x | +| RGB-YUV420p | x | +| RGBA-YUV420p | x | +| BGR-YUV420p | x | +| BGRA-YUV420p | x | +| RGB-YUV420sp | x | +| RGBA-YUV420sp | x | +| BGR-YUV420sp | x | +| BGRA-YUV420sp | x | ## Data type conversions | | u8 | s8 | f32 | diff --git a/doc/opencv.md b/doc/opencv.md index 6d0021177b8b54f0354392f43355e5b6768e1a05..2cd83c87b4873d9c4bea5ca16081ab2fadab22cc 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -98,6 +98,39 @@ Notes on parameters: * `src.depth()` - only supports `CV_8U` depth. * `src.channels()` - supports 3 for RGB/BGR and 4 for RGBA/BGRA. 
+ +#### [`COLOR_RGB2YUV_IYUV`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=V%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_RGB2YUV_IYUV%C2%A0,-Python%3A%20cv.COLOR_RGB2YUV_IYUV), [`COLOR_BGR2YUV_IYUV`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=synonym%20to%20I420-,COLOR_BGR2YUV_IYUV,-Python%3A%20cv.COLOR_BGR2YUV_IYUV), [`COLOR_RGBA2YUV_IYUV`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=V%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_RGBA2YUV_IYUV,-Python%3A%20cv.COLOR_RGBA2YUV_IYUV), [`COLOR_BGRA2YUV_IYUV`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=synonym%20to%20I420-,COLOR_BGRA2YUV_IYUV,-Python%3A%20cv.COLOR_BGRA2YUV_IYUV) + +RGB/BGR(A/X) to YUV420 planar (IYUV/YUV420p) conversion. +This transformation outputs three separate planes: Y, U, and V, where chroma channels (U and V) are subsampled by 2 in both horizontal and vertical directions. +RGBX/BGRX and RGBA/BGRA formats are supported, with the last (X or Alpha) channel ignored during the conversion. + +| | RGB | BGR | RGBA | BGRA | RGBX | BGRX | +|---------|-----|-----|------|------|------|------| +| YUV420p | x | x | x | x | x | x | + +**Notes on parameters:** +- `src.depth()` — only supports `CV_8U` depth. +- `src.channels()` — supports 3 (RGB/BGR), 4 (RGBA/BGRA/RGBX/BGRX). For RGBX/BGRX, the X channel is ignored. +- `dst` — output is a single image containing the Y plane followed by U and V planes (IYUV layout). 
+ + +#### [`COLOR_RGB2YUV_YV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=synonym%20to%20I420-,COLOR_RGB2YUV_YV12,-Python%3A%20cv.COLOR_RGB2YUV_YV12), [`COLOR_BGR2YUV_YV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=U%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_BGR2YUV_YV12,-Python%3A%20cv.COLOR_BGR2YUV_YV12), [`COLOR_RGBA2YUV_YV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=U%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_RGBA2YUV_YV12,-Python%3A%20cv.COLOR_RGBA2YUV_YV12), [`COLOR_BGRA2YUV_YV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0ab91f1a5041e1d4b7b0c1f4d0b69479e5:~:text=U%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_BGRA2YUV_YV12,-Python%3A%20cv.COLOR_BGRA2YUV_YV12) + +RGB/BGR(A/X) to YUV420 planar (YV12) conversion. +Like IYUV, this format stores Y, U, and V planes with 4:2:0 chroma subsampling. The only difference is that U and V planes are **swapped** in memory layout: Y is followed by V then U. +RGBX/BGRX and RGBA/BGRA formats are supported, with the last (X or Alpha) channel ignored during the conversion. + +| | RGB | BGR | RGBA | BGRA | RGBX | BGRX | +|---------|-----|-----|------|------|------|------| +| YUV420p | x | x | x | x | x | x | + +**Notes on parameters:** +- `src.depth()` — only supports `CV_8U` depth. +- `src.channels()` — supports 3 (RGB/BGR), 4 (RGBA/BGRA/RGBX/BGRX). For RGBX/BGRX, the X channel is ignored. +- `dst` — output is a single image containing the Y plane followed by V and U planes (YV12 layout). 
+ + ### [`cv::GaussianBlur()`](https://docs.opencv.org/4.11.0/d4/d86/group__imgproc__filter.html#gae8bdcd9154ed5ca3cbc1766d960f45c1) > ⚠️ **The operation is not bitexact with OpenCV due to rounding differences even if ALGO_HINT_ACCURATE is used as the hint parameter.** diff --git a/kleidicv/include/kleidicv/conversions/rgb_to_yuv420p.h b/kleidicv/include/kleidicv/conversions/rgb_to_yuv420p.h new file mode 100644 index 0000000000000000000000000000000000000000..14c0bebba9e403967e7ad8979a7c319af28ecb9a --- /dev/null +++ b/kleidicv/include/kleidicv/conversions/rgb_to_yuv420p.h @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_CONVERSIONS_RGB_TO_YUV420P_H +#define KLEIDICV_CONVERSIONS_RGB_TO_YUV420P_H + +#include + +#include "kleidicv/config.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/types.h" + +extern "C" { + +// For internal use only. See instead `kleidicv_rgb_to_yuv420p_u8`. +// Converts a stripe (i.e., a row range) of a RGB image to a planar YUV420 image +// format (I420 or YV12). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_rgb_to_yuv420p_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_rgba_to_yuv420p_u8`. +// Converts a stripe (i.e., a row range) of a RGBA image to a planar YUV420 +// image format (I420 or YV12). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_rgba_to_yuv420p_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_bgr_to_yuv420p_u8`. +// Converts a stripe (i.e., a row range) of a BGR image to a planar YUV420 image +// format (I420 or YV12). 
The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_bgr_to_yuv420p_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_bgra_to_yuv420p_u8`. +// Converts a stripe (i.e., a row range) of a BGRA image to a planar YUV420 +// image format (I420 or YV12). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_bgra_to_yuv420p_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); +} + +namespace kleidicv { + +namespace neon { +kleidicv_error_t rgb_to_yuv420p_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool is_nv21, size_t begin, + size_t end); + +kleidicv_error_t rgba_to_yuv420p_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +kleidicv_error_t bgr_to_yuv420p_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool is_nv21, size_t begin, + size_t end); + +kleidicv_error_t bgra_to_yuv420p_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +} // namespace neon + +} // namespace kleidicv + +#endif // KLEIDICV_CONVERSIONS_RGB_TO_YUV420P_H diff --git a/kleidicv/include/kleidicv/conversions/rgb_to_yuv420sp.h b/kleidicv/include/kleidicv/conversions/rgb_to_yuv420sp.h new file mode 100644 index 0000000000000000000000000000000000000000..2e7993bf3b186b16a938baac76e43dce858f3ddd --- /dev/null +++ b/kleidicv/include/kleidicv/conversions/rgb_to_yuv420sp.h @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: 2025 Arm 
Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_CONVERSIONS_RGB_TO_YUV420SP_H +#define KLEIDICV_CONVERSIONS_RGB_TO_YUV420SP_H + +#include + +#include "kleidicv/config.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/types.h" + +extern "C" { + +// For internal use only. See instead `kleidicv_rgb_to_yuv420sp_u8`. +// Converts a stripe (i.e., a row range) of a RGB image to a semi-planar YUV420 +// image format (NV12 or NV21). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_rgb_to_yuv420sp_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, + size_t height, bool is_nv21, size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_rgba_to_yuv420sp_u8`. +// Converts a stripe (i.e., a row range) of a RGBA image to a semi-planar YUV420 +// image format (NV12 or NV21). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_rgba_to_yuv420sp_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, size_t uv_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_bgr_to_yuv420sp_u8`. +// Converts a stripe (i.e., a row range) of a BGR image to a semi-planar YUV420 +// image format (NV12 or NV21). The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_bgr_to_yuv420sp_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, + size_t height, bool is_nv21, size_t begin, size_t end); + +// For internal use only. See instead `kleidicv_bgra_to_yuv420sp_u8`. +// Converts a stripe (i.e., a row range) of a BGRA image to a semi-planar YUV420 +// image format (NV12 or NV21). The stripe is defined by the range [begin, end].
+KLEIDICV_API_DECLARATION(kleidicv_bgra_to_yuv420sp_stripe_u8, + const uint8_t *src, size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, size_t uv_stride, + size_t width, size_t height, bool is_nv21, + size_t begin, size_t end); +} + +namespace kleidicv { + +namespace neon { +kleidicv_error_t rgb_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +kleidicv_error_t rgba_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +kleidicv_error_t bgr_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +kleidicv_error_t bgra_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end); + +} // namespace neon + +} // namespace kleidicv + +#endif // KLEIDICV_CONVERSIONS_RGB_TO_YUV420SP_H diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h index 09761af88fadb00061006d20f51ce2766d7e699d..025bf3592cbb0b129d6468b68604bf541c943203 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -811,6 +811,118 @@ KLEIDICV_API_DECLARATION(kleidicv_rgba_to_yuv_u8, const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height); +#define KLEIDICV_OP_RGB_TO_YUV420P(name) \ + kleidicv_error_t name(const uint8_t *src, size_t src_stride, uint8_t *dst, \ + size_t dst_stride, size_t width, size_t height, \ + bool is_nv21); + +/// Converts an interleaved 
RGB, RGBA, BGR, or BGRA image to planar YUV420 +/// format (I420 or YV12 layout). All channels are 8-bit wide. If the input +/// format includes an alpha channel, it is ignored. +/// +/// ### Source format +/// Source data has 3 or 4 interleaved channels per pixel: +/// - R, G, B +/// - B, G, R +/// - R, G, B, Alpha +/// - B, G, R, Alpha +/// +/// One pixel occupies 3 or 4 bytes, depending on the format. There is no +/// padding between pixels. +/// +/// ### Destination format: Planar YUV420 +/// The output buffer consists of three planes concatenated in memory: +/// - Y plane: full resolution, size = width × height +/// - U plane: quarter resolution, size = (width / 2) × (height / 2) +/// - V plane: quarter resolution, size = (width / 2) × (height / 2) +/// +/// If `is_nv21 == false`, the format is **I420**: Y + U + V +/// If `is_nv21 == true`, the format is **YV12**: Y + V + U +/// +/// Width and height refer to the **full image** dimensions. Total number of +/// pixels must not exceed @ref KLEIDICV_MAX_IMAGE_PIXELS. +/// +/// @param src Pointer to the source buffer containing interleaved +/// RGBX/BGRX data. +/// Must be non-null. +/// @param src_stride Byte offset between the start of one source row and the +/// next. Must be at least `(source channel count) * width`, +/// unless the image has only one row. +/// @param dst Pointer to the destination buffer to store the Y + U + V +/// (I420) or Y + V + U (YV12) +/// planes. Must be non-null. +/// @param dst_stride Stride (in bytes) between rows in the Y plane of the +/// output. +/// Must be at least `width`. +/// @param width Number of pixels in a row. +/// @param height Number of rows in the image. +/// @param is_nv21 If true, use YV12 layout (Y + V + U). Otherwise, +/// use I420 layout (Y + U + V).
+KLEIDICV_OP_RGB_TO_YUV420P(kleidicv_rgb_to_yuv420p_u8); +/// @copydoc kleidicv_rgb_to_yuv420p_u8 +KLEIDICV_OP_RGB_TO_YUV420P(kleidicv_rgba_to_yuv420p_u8); +/// @copydoc kleidicv_rgb_to_yuv420p_u8 +KLEIDICV_OP_RGB_TO_YUV420P(kleidicv_bgr_to_yuv420p_u8); +/// @copydoc kleidicv_rgb_to_yuv420p_u8 +KLEIDICV_OP_RGB_TO_YUV420P(kleidicv_bgra_to_yuv420p_u8); + +#define KLEIDICV_OP_RGB_TO_YUV420SP(name) \ + kleidicv_error_t name(const uint8_t *src, size_t src_stride, uint8_t *y_dst, \ + size_t y_stride, uint8_t *uv_dst, size_t uv_stride, \ + size_t width, size_t height, bool is_nv21); + +/// Converts an interleaved RGB, RGBA, BGR, or BGRA image to semi-planar +/// YUV420 format (NV12 or NV21 layout). All channels are 8-bit wide. +/// If the input format includes an alpha channel, it is ignored. +/// +/// ### Source Format +/// Source data has 3 or 4 interleaved channels per pixel: +/// - R, G, B +/// - B, G, R +/// - R, G, B, Alpha +/// - B, G, R, Alpha +/// +/// Each pixel occupies 3 or 4 bytes, depending on the format. There is no +/// padding between pixels. +/// +/// ### Destination Format: Semi-Planar YUV420 +/// The output consists of two planes: +/// - Y plane: full resolution, size = width × height +/// - UV plane: interleaved chroma (U and V) at quarter resolution, +/// size = (width / 2) × (height / 2) × 2 bytes per chroma sample +/// pair +/// +/// If `is_nv21 == false`, the format is **NV12**: Y + interleaved UV +/// If `is_nv21 == true`, the format is **NV21**: Y + interleaved VU +/// +/// Width and height refer to the full image dimensions. +/// Total number of pixels must not exceed @ref KLEIDICV_MAX_IMAGE_PIXELS. +/// +/// @param src Pointer to the source buffer containing interleaved +/// RGBX/BGRX data. Must be non-null. +/// @param src_stride Byte offset between the start of one source row and the +/// next. Must be at least `(source channel count) * width`, +/// unless the image has only one row. +/// @param y_dst Pointer to the destination Y plane. 
Must be non-null. +/// @param y_stride Byte offset between the start of one Y row and the next. +/// Must be at least `width`. +/// @param uv_dst Pointer to the destination UV plane (interleaved). +/// Must be non-null. +/// @param uv_stride Byte offset between the start of one UV row and the +/// next. +/// Must be at least `__builtin_align_up(width, 2)`. +/// @param width Number of pixels per row in the image. +/// @param height Number of rows in the image. +/// @param is_nv21 If true, UV plane is written in VU order (NV21). +/// Otherwise, UV plane is written in UV order (NV12). +KLEIDICV_OP_RGB_TO_YUV420SP(kleidicv_rgb_to_yuv420sp_u8); +/// @copydoc kleidicv_rgb_to_yuv420sp_u8 +KLEIDICV_OP_RGB_TO_YUV420SP(kleidicv_rgba_to_yuv420sp_u8); +/// @copydoc kleidicv_rgb_to_yuv420sp_u8 +KLEIDICV_OP_RGB_TO_YUV420SP(kleidicv_bgr_to_yuv420sp_u8); +/// @copydoc kleidicv_rgb_to_yuv420sp_u8 +KLEIDICV_OP_RGB_TO_YUV420SP(kleidicv_bgra_to_yuv420sp_u8); + /// Performs a comparison of each element's value in `src` with respect to a /// caller defined threshold. The strictly larger elements are set to /// `value` and the rest to 0. 
diff --git a/kleidicv/src/conversions/rgb_to_yuv420_neon.h b/kleidicv/src/conversions/rgb_to_yuv420_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..7bc34fd024f9e0f020d471293615f5bbe0c6862c --- /dev/null +++ b/kleidicv/src/conversions/rgb_to_yuv420_neon.h @@ -0,0 +1,407 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_RGB_TO_YUV420_H +#define KLEIDICV_RGB_TO_YUV420_H + +#include +#include + +#include "kleidicv/conversions/rgb_to_yuv420p.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" + +namespace kleidicv::neon { + +// Fixed-point coefficients for RGB to YUV420 conversion +static const int kWeightScale = 20; +static const int kRYWeight = + 269484; // 0.299055 * (236-16)/256 * (1 << kWeightScale) +static const int kGYWeight = + 528482; // 0.586472 * (236-16)/256 * (1 << kWeightScale) +static const int kBYWeight = + 102760; // 0.114035 * (236-16)/256 * (1 << kWeightScale) +static const int kRUWeight = -155188; // -0.148 * (1 << kWeightScale) +static const int kGUWeight = -305135; // -0.291 * (1 << kWeightScale) +static const int kBUWeight = 460324; // 0.439 * (1 << kWeightScale) +static const int kGVWeight = -385875; // -0.368 * (1 << kWeightScale) +static const int kBVWeight = -74448; // -0.071 * (1 << kWeightScale) + +template +class RGBxorBGRxToYUV420 { + public: + static kleidicv_error_t rgb2yuv420p_operation( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, size_t begin, size_t end) { + size_t row_begin = begin * 2; + size_t row_end = std::min(height, end * 2); + + const size_t vector_width = (width >> 5) << 5; + + const uint8_t *src_row = nullptr; + uint8_t *y_row = nullptr; + uint8_t *u_row = nullptr; + uint8_t *v_row = nullptr; + for (size_t h = row_begin; h < row_end; h++) { + src_row = src + src_stride * h; + y_row =
y_dst + y_stride * h; + + bool evenRow = (h % 2) == 0; + + if (evenRow) { + if constexpr (INTERLEAVE) { + u_row = uv_dst + uv_stride * (h / 2); + } else { + u_row = uv_dst + uv_stride * (h / 4) + ((h / 2) % 2) * (width / 2); + v_row = uv_dst + uv_stride * ((h + height + 1) / 4) + + (((h + height + 1) / 2) % 2) * (width / 2); + } + } + + size_t w = 0; + vector_path(src_row, y_row, u_row, v_row, is_nv21, w, vector_width, + evenRow); + scalar_path(src_row, y_row, u_row, v_row, is_nv21, w, width, evenRow); + } + + return KLEIDICV_OK; + } + + private: + static void vector_path(const uint8_t *src_row, uint8_t *y_row, + uint8_t *u_row, uint8_t *v_row, bool is_nv21, + size_t &w, size_t vector_width, bool evenRow) { + const size_t vsize = 16; + for (; w < (vector_width / 2); w += vsize) { + // processing (2*vsize) pixels at once + uint8x16_t b0, b1, g0, g1, r0, r1; + load_rgb(b0, b1, g0, g1, r0, r1, src_row, w); + // Convert both vectors to luminance (Y channel) + uint8x16_t y0 = rgb_to_y(r0, g0, b0); + uint8x16_t y1 = rgb_to_y(r1, g1, b1); + + // Store Y values: 32 pixels total (two vectors) + vst1q_u8(y_row + 2 * w, y0); + vst1q_u8(y_row + 2 * w + vsize, y1); + + if (evenRow) { + uint8x16_t u, v; + rgb_to_uv(r0, r1, g0, g1, b0, b1, u, v); + if (is_nv21) { + std::swap(u, v); + } + if constexpr (INTERLEAVE) { + uint8x16x2_t uv; + uv.val[0] = u; + uv.val[1] = v; + vst2q_u8(u_row + w * 2, uv); + } else { + vst1q_u8(u_row + w, u); + vst1q_u8(v_row + w, v); + } + } + } + } + + static void scalar_path(const uint8_t *src_row, uint8_t *y_row, + uint8_t *u_row, uint8_t *v_row, bool is_nv21, + size_t &w, size_t width, bool evenRow) { + for (w = w * 2; w < width; w += 1) { + uint8_t b0{}, g0{}, r0{}; + constexpr size_t scn = ALPHA ? 
4 : 3; + bool evenCol = (w % 2) == 0; + b0 = src_row[(w + 0) * scn + 0]; + g0 = src_row[(w + 0) * scn + 1]; + r0 = src_row[(w + 0) * scn + 2]; + // Swap R and B channels if in RGB format + if constexpr (RGB) { + std::swap(b0, r0); + } + uint8_t y0 = rgb_to_y(r0, g0, b0); + y_row[w + 0] = y0; + + if (evenRow && evenCol) { + uint8_t uu = 0, vv = 0; + rgb_to_uv(r0, g0, b0, uu, vv); + if (is_nv21) { + std::swap(uu, vv); + } + + if constexpr (INTERLEAVE) { + u_row[w + 0] = uu; + u_row[w + 1] = vv; + } else { + u_row[w >> 1] = uu; + v_row[w >> 1] = vv; + } + } + } + } + + static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) { + const int shifted16 = (16 << kWeightScale); + const int halfShift = (1 << (kWeightScale - 1)); + int yy = + kRYWeight * r + kGYWeight * g + kBYWeight * b + halfShift + shifted16; + + return std::clamp(yy >> kWeightScale, 0, 0xff); + } + + static uint8x16_t rgb_to_y(const uint8x16_t &r, const uint8x16_t &g, + const uint8x16_t &b) { + const int shifted16 = (16 << kWeightScale); + const int halfShift = (1 << (kWeightScale - 1)); + + // Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore) + // These are needed to expand each group of 4 bytes into a full 32-bit lane + uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, + 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff}; + + uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff, + 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, + 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff, + 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; + + // Expand each 8-bit channel into 32-bit vectors using table lookup and + // reinterpret + uint32x4_t r_lo_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, index_lo_lo)); + uint32x4_t r_lo_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, index_lo_hi)); + uint32x4_t r_hi_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, 
index_hi_lo)); + uint32x4_t r_hi_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, index_hi_hi)); + + uint32x4_t g_lo_lo = vreinterpretq_u32_u8(vqtbl1q_u8(g, index_lo_lo)); + uint32x4_t g_lo_hi = vreinterpretq_u32_u8(vqtbl1q_u8(g, index_lo_hi)); + uint32x4_t g_hi_lo = vreinterpretq_u32_u8(vqtbl1q_u8(g, index_hi_lo)); + uint32x4_t g_hi_hi = vreinterpretq_u32_u8(vqtbl1q_u8(g, index_hi_hi)); + + uint32x4_t b_lo_lo = vreinterpretq_u32_u8(vqtbl1q_u8(b, index_lo_lo)); + uint32x4_t b_lo_hi = vreinterpretq_u32_u8(vqtbl1q_u8(b, index_lo_hi)); + uint32x4_t b_hi_lo = vreinterpretq_u32_u8(vqtbl1q_u8(b, index_hi_lo)); + uint32x4_t b_hi_hi = vreinterpretq_u32_u8(vqtbl1q_u8(b, index_hi_hi)); + + // Prepare constants for fixed-point MAC (multiply-accumulate) + uint32x4_t v_kRYWeight = vdupq_n_u32(kRYWeight); + uint32x4_t v_kGYWeight = vdupq_n_u32(kGYWeight); + uint32x4_t v_kBYWeight = vdupq_n_u32(kBYWeight); + uint32x4_t y_lo_lo = vdupq_n_u32(halfShift + shifted16); + uint32x4_t y_lo_hi = vdupq_n_u32(halfShift + shifted16); + uint32x4_t y_hi_lo = vdupq_n_u32(halfShift + shifted16); + uint32x4_t y_hi_hi = vdupq_n_u32(halfShift + shifted16); + + // Apply Y = kR*R + kG*G + kB*B + rounding bias + y_lo_lo = vmlaq_u32(y_lo_lo, r_lo_lo, v_kRYWeight); + y_lo_hi = vmlaq_u32(y_lo_hi, r_lo_hi, v_kRYWeight); + y_hi_lo = vmlaq_u32(y_hi_lo, r_hi_lo, v_kRYWeight); + y_hi_hi = vmlaq_u32(y_hi_hi, r_hi_hi, v_kRYWeight); + + y_lo_lo = vmlaq_u32(y_lo_lo, g_lo_lo, v_kGYWeight); + y_lo_hi = vmlaq_u32(y_lo_hi, g_lo_hi, v_kGYWeight); + y_hi_lo = vmlaq_u32(y_hi_lo, g_hi_lo, v_kGYWeight); + y_hi_hi = vmlaq_u32(y_hi_hi, g_hi_hi, v_kGYWeight); + + y_lo_lo = vmlaq_u32(y_lo_lo, b_lo_lo, v_kBYWeight); + y_lo_hi = vmlaq_u32(y_lo_hi, b_lo_hi, v_kBYWeight); + y_hi_lo = vmlaq_u32(y_hi_lo, b_hi_lo, v_kBYWeight); + y_hi_hi = vmlaq_u32(y_hi_hi, b_hi_hi, v_kBYWeight); + + // Normalize down by right-shifting the fixed-point result + y_lo_lo = vshrq_n_u32(y_lo_lo, kWeightScale); + y_lo_hi = vshrq_n_u32(y_lo_hi, kWeightScale); + 
y_hi_lo = vshrq_n_u32(y_hi_lo, kWeightScale); + y_hi_hi = vshrq_n_u32(y_hi_hi, kWeightScale); + + // Pack the result into 8-bit vector lanes + uint8x16x4_t y; + y.val[0] = vreinterpretq_u8_u32(y_lo_lo); + y.val[1] = vreinterpretq_u8_u32(y_lo_hi); + y.val[2] = vreinterpretq_u8_u32(y_hi_lo); + y.val[3] = vreinterpretq_u8_u32(y_hi_hi); + + // Final shuffle to extract the first byte of each lane into a flat vector + uint8x16_t index = {0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60}; + uint8x16_t output = vqtbl4q_u8(y, index); + + return output; + } + + static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t &u, + uint8_t &v) { + const int halfShift = (1 << (kWeightScale - 1)); + const int shifted128 = (128 << kWeightScale); + int uu = + kRUWeight * r + kGUWeight * g + kBUWeight * b + halfShift + shifted128; + int vv = + kBUWeight * r + kGVWeight * g + kBVWeight * b + halfShift + shifted128; + + u = std::clamp(uu >> kWeightScale, 0, 0xff); + v = std::clamp(vv >> kWeightScale, 0, 0xff); + } + + static void rgb_to_uv(const uint8x16_t &r0, const uint8x16_t &r1, + const uint8x16_t &g0, const uint8x16_t &g1, + const uint8x16_t &b0, const uint8x16_t &b1, + uint8x16_t &u, uint8x16_t &v) { + // NEON lookup indices to extract even-indexed bytes into 32-bit lanes + uint8x16_t index_lo = {0, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, + 4, 0xff, 0xff, 0xff, 6, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi = {8, 0xff, 0xff, 0xff, 10, 0xff, 0xff, 0xff, + 12, 0xff, 0xff, 0xff, 14, 0xff, 0xff, 0xff}; + + // Extend RGB0 and RGB1 from uint8 to int32 using table lookups and + // reinterpret casts + int32x4_t r_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(r0, index_lo)); + int32x4_t r_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(r0, index_hi)); + int32x4_t r_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(r1, index_lo)); + int32x4_t r_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(r1, index_hi)); + + int32x4_t g_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(g0, index_lo)); + int32x4_t g_lo_hi = 
vreinterpretq_s32_u8(vqtbl1q_u8(g0, index_hi)); + int32x4_t g_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(g1, index_lo)); + int32x4_t g_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(g1, index_hi)); + + int32x4_t b_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(b0, index_lo)); + int32x4_t b_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(b0, index_hi)); + int32x4_t b_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(b1, index_lo)); + int32x4_t b_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(b1, index_hi)); + + // Constants for U/V calculation + const int halfShift = (1 << (kWeightScale - 1)); + const int shifted128 = (128 << kWeightScale); + + // ---------------- U (Cb) Component ---------------- + int32x4_t v_kRUWeight = vdupq_n_s32(kRUWeight); + int32x4_t v_kGUWeight = vdupq_n_s32(kGUWeight); + int32x4_t v_kBUWeight = vdupq_n_s32(kBUWeight); + + // Initialize accumulation with bias + int32x4_t u_lo_lo = vdupq_n_s32(halfShift + shifted128); + int32x4_t u_lo_hi = vdupq_n_s32(halfShift + shifted128); + int32x4_t u_hi_lo = vdupq_n_s32(halfShift + shifted128); + int32x4_t u_hi_hi = vdupq_n_s32(halfShift + shifted128); + + // U = R * kRU + G * kGU + B * kBU + bias + u_lo_lo = vmlaq_s32(u_lo_lo, r_lo_lo, v_kRUWeight); + u_lo_hi = vmlaq_s32(u_lo_hi, r_lo_hi, v_kRUWeight); + u_hi_lo = vmlaq_s32(u_hi_lo, r_hi_lo, v_kRUWeight); + u_hi_hi = vmlaq_s32(u_hi_hi, r_hi_hi, v_kRUWeight); + + u_lo_lo = vmlaq_s32(u_lo_lo, g_lo_lo, v_kGUWeight); + u_lo_hi = vmlaq_s32(u_lo_hi, g_lo_hi, v_kGUWeight); + u_hi_lo = vmlaq_s32(u_hi_lo, g_hi_lo, v_kGUWeight); + u_hi_hi = vmlaq_s32(u_hi_hi, g_hi_hi, v_kGUWeight); + + u_lo_lo = vmlaq_s32(u_lo_lo, b_lo_lo, v_kBUWeight); + u_lo_hi = vmlaq_s32(u_lo_hi, b_lo_hi, v_kBUWeight); + u_hi_lo = vmlaq_s32(u_hi_lo, b_hi_lo, v_kBUWeight); + u_hi_hi = vmlaq_s32(u_hi_hi, b_hi_hi, v_kBUWeight); + + // Normalize to 8-bit by shifting + u_lo_lo = vshrq_n_s32(u_lo_lo, kWeightScale); + u_lo_hi = vshrq_n_s32(u_lo_hi, kWeightScale); + u_hi_lo = vshrq_n_s32(u_hi_lo, kWeightScale); + u_hi_hi = 
vshrq_n_s32(u_hi_hi, kWeightScale); + + // Pack into single 16-byte vector + uint8x16x4_t output; + output.val[0] = vreinterpretq_u8_s32(u_lo_lo); + output.val[1] = vreinterpretq_u8_s32(u_lo_hi); + output.val[2] = vreinterpretq_u8_s32(u_hi_lo); + output.val[3] = vreinterpretq_u8_s32(u_hi_hi); + + // Index vector for reordering bytes into linear output + uint8x16_t index = {0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60}; + + u = vqtbl4q_u8(output, index); + + // ---------------- V (Cr) Component ---------------- + int32x4_t v_kGVWeight = vdupq_n_s32(kGVWeight); + int32x4_t v_kBVWeight = vdupq_n_s32(kBVWeight); + v_kRUWeight = vdupq_n_s32(kBUWeight); + + int32x4_t v_lo_lo = vdupq_n_s32(halfShift + shifted128); + int32x4_t v_lo_hi = vdupq_n_s32(halfShift + shifted128); + int32x4_t v_hi_lo = vdupq_n_s32(halfShift + shifted128); + int32x4_t v_hi_hi = vdupq_n_s32(halfShift + shifted128); + + // V = R * kBU + G * kGV + B * kBV + bias + v_lo_lo = vmlaq_s32(v_lo_lo, r_lo_lo, v_kRUWeight); + v_lo_hi = vmlaq_s32(v_lo_hi, r_lo_hi, v_kRUWeight); + v_hi_lo = vmlaq_s32(v_hi_lo, r_hi_lo, v_kRUWeight); + v_hi_hi = vmlaq_s32(v_hi_hi, r_hi_hi, v_kRUWeight); + + v_lo_lo = vmlaq_s32(v_lo_lo, g_lo_lo, v_kGVWeight); + v_lo_hi = vmlaq_s32(v_lo_hi, g_lo_hi, v_kGVWeight); + v_hi_lo = vmlaq_s32(v_hi_lo, g_hi_lo, v_kGVWeight); + v_hi_hi = vmlaq_s32(v_hi_hi, g_hi_hi, v_kGVWeight); + + v_lo_lo = vmlaq_s32(v_lo_lo, b_lo_lo, v_kBVWeight); + v_lo_hi = vmlaq_s32(v_lo_hi, b_lo_hi, v_kBVWeight); + v_hi_lo = vmlaq_s32(v_hi_lo, b_hi_lo, v_kBVWeight); + v_hi_hi = vmlaq_s32(v_hi_hi, b_hi_hi, v_kBVWeight); + + // Normalize + v_lo_lo = vshrq_n_s32(v_lo_lo, kWeightScale); + v_lo_hi = vshrq_n_s32(v_lo_hi, kWeightScale); + v_hi_lo = vshrq_n_s32(v_hi_lo, kWeightScale); + v_hi_hi = vshrq_n_s32(v_hi_hi, kWeightScale); + + // Pack and shuffle + output.val[0] = vreinterpretq_u8_s32(v_lo_lo); + output.val[1] = vreinterpretq_u8_s32(v_lo_hi); + output.val[2] = vreinterpretq_u8_s32(v_hi_lo); + 
output.val[3] = vreinterpretq_u8_s32(v_hi_hi); + + v = vqtbl4q_u8(output, index); + } + + static void load_rgb(uint8x16_t &b0, uint8x16_t &b1, uint8x16_t &g0, + uint8x16_t &g1, uint8x16_t &r0, uint8x16_t &r1, + const uint8_t *src_row, size_t w) { + // Load 32 pixels: two vectors of interleaved channels + const size_t vsize = 16; + if constexpr (ALPHA) { + // 4-channel input (e.g., RGBA or BGRA) + uint8x16x4_t vsrc0 = vld4q_u8(src_row + 8 * w); + uint8x16x4_t vsrc1 = vld4q_u8(src_row + 8 * w + 4 * vsize); + + b0 = vsrc0.val[0]; + g0 = vsrc0.val[1]; + r0 = vsrc0.val[2]; + + b1 = vsrc1.val[0]; + g1 = vsrc1.val[1]; + r1 = vsrc1.val[2]; + } else { + // 3-channel input (e.g., RGB or BGR) + uint8x16x3_t vsrc0 = vld3q_u8(src_row + 6 * w); + uint8x16x3_t vsrc1 = vld3q_u8(src_row + 6 * w + 3 * vsize); + + b0 = vsrc0.val[0]; + g0 = vsrc0.val[1]; + r0 = vsrc0.val[2]; + + b1 = vsrc1.val[0]; + g1 = vsrc1.val[1]; + r1 = vsrc1.val[2]; + } + + // Swap R and B channels if in RGB format + if constexpr (RGB) { + std::swap(b0, r0); + std::swap(b1, r1); + } + } +}; + +} // namespace kleidicv::neon + +#endif // KLEIDICV_RGB_TO_YUV420_H diff --git a/kleidicv/src/conversions/rgb_to_yuv420p_api.cpp b/kleidicv/src/conversions/rgb_to_yuv420p_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e65fd54a0395e7b4d05b77281d8ab39838a2198 --- /dev/null +++ b/kleidicv/src/conversions/rgb_to_yuv420p_api.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/rgb_to_yuv420p.h" +#include "kleidicv/dispatch.h" +#include "kleidicv/kleidicv.h" + +#define KLEIDICV_DEFINE_C_API(name, partialname) \ + KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::partialname, nullptr, \ + nullptr, nullptr) + +KLEIDICV_DEFINE_C_API(kleidicv_rgb_to_yuv420p_stripe_u8, + rgb_to_yuv420p_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_rgba_to_yuv420p_stripe_u8, + rgba_to_yuv420p_stripe_u8); + 
+KLEIDICV_DEFINE_C_API(kleidicv_bgr_to_yuv420p_stripe_u8, + bgr_to_yuv420p_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_bgra_to_yuv420p_stripe_u8, + bgra_to_yuv420p_stripe_u8); + +extern "C" { + +kleidicv_error_t kleidicv_rgb_to_yuv420p_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_rgb_to_yuv420p_stripe_u8(src, src_stride, dst, dst_stride, + width, height, is_nv21, 0, height); /* one stripe over the whole image: begin/end count row pairs and the implementation clamps the end row to height */ +} + +kleidicv_error_t kleidicv_rgba_to_yuv420p_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_rgba_to_yuv420p_stripe_u8(src, src_stride, dst, dst_stride, + width, height, is_nv21, 0, height); +} + +kleidicv_error_t kleidicv_bgr_to_yuv420p_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_bgr_to_yuv420p_stripe_u8(src, src_stride, dst, dst_stride, + width, height, is_nv21, 0, height); +} + +kleidicv_error_t kleidicv_bgra_to_yuv420p_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_bgra_to_yuv420p_stripe_u8(src, src_stride, dst, dst_stride, + width, height, is_nv21, 0, height); +} + +} // extern "C" diff --git a/kleidicv/src/conversions/rgb_to_yuv420p_neon.cpp b/kleidicv/src/conversions/rgb_to_yuv420p_neon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec4f709daa674314c6d9a751747157b57a8262f0 --- /dev/null +++ b/kleidicv/src/conversions/rgb_to_yuv420p_neon.cpp @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/rgb_to_yuv420p.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "rgb_to_yuv420_neon.h" + +namespace kleidicv::neon { + +KLEIDICV_TARGET_FN_ATTRS 
+kleidicv_error_t rgb_to_yuv420p_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool is_nv21, size_t begin, + size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, (height * 3 + 1) / 2); /* dst holds the luma rows followed by the chroma rows */ + CHECK_IMAGE_SIZE(width, height); + uint8_t *uv_dst = dst + dst_stride * height; /* chroma is written to the same buffer, starting right after the `height` luma rows */ + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, dst, dst_stride, uv_dst, dst_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t rgba_to_yuv420p_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, (height * 3 + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + uint8_t *uv_dst = dst + dst_stride * height; + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, dst, dst_stride, uv_dst, dst_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t bgr_to_yuv420p_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool is_nv21, size_t begin, + size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, (height * 3 + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + uint8_t *uv_dst = dst + dst_stride * height; + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, dst, dst_stride, uv_dst, dst_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t bgra_to_yuv420p_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + 
CHECK_POINTER_AND_STRIDE(dst, dst_stride, (height * 3 + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + uint8_t *uv_dst = dst + dst_stride * height; + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, dst, dst_stride, uv_dst, dst_stride, width, height, + is_nv21, begin, end); +} + +} // namespace kleidicv::neon diff --git a/kleidicv/src/conversions/rgb_to_yuv420sp_api.cpp b/kleidicv/src/conversions/rgb_to_yuv420sp_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..30c9eacbea9711c560b06b95d3798a8e4e36eb16 --- /dev/null +++ b/kleidicv/src/conversions/rgb_to_yuv420sp_api.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/rgb_to_yuv420sp.h" +#include "kleidicv/dispatch.h" +#include "kleidicv/kleidicv.h" + +#define KLEIDICV_DEFINE_C_API(name, partialname) \ + KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::partialname, nullptr, \ + nullptr, nullptr) + +KLEIDICV_DEFINE_C_API(kleidicv_rgb_to_yuv420sp_stripe_u8, + rgb_to_yuv420sp_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_rgba_to_yuv420sp_stripe_u8, + rgba_to_yuv420sp_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_bgr_to_yuv420sp_stripe_u8, + bgr_to_yuv420sp_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_bgra_to_yuv420sp_stripe_u8, + bgra_to_yuv420sp_stripe_u8); + +extern "C" { + +kleidicv_error_t kleidicv_rgb_to_yuv420sp_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_rgb_to_yuv420sp_stripe_u8(src, src_stride, y_dst, y_stride, + uv_dst, uv_stride, width, height, + is_nv21, 0, height); /* one stripe over the whole image; the stripe end is given in row pairs and is clamped to height by the implementation */ +} + +kleidicv_error_t kleidicv_rgba_to_yuv420sp_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21) { + return 
kleidicv_rgba_to_yuv420sp_stripe_u8(src, src_stride, y_dst, y_stride, + uv_dst, uv_stride, width, height, + is_nv21, 0, height); +} + +kleidicv_error_t kleidicv_bgr_to_yuv420sp_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_bgr_to_yuv420sp_stripe_u8(src, src_stride, y_dst, y_stride, + uv_dst, uv_stride, width, height, + is_nv21, 0, height); +} + +kleidicv_error_t kleidicv_bgra_to_yuv420sp_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21) { + return kleidicv_bgra_to_yuv420sp_stripe_u8(src, src_stride, y_dst, y_stride, + uv_dst, uv_stride, width, height, + is_nv21, 0, height); +} + +} // extern "C" diff --git a/kleidicv/src/conversions/rgb_to_yuv420sp_neon.cpp b/kleidicv/src/conversions/rgb_to_yuv420sp_neon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..920aee3d73938a840947f6277e083e7efc2f074f --- /dev/null +++ b/kleidicv/src/conversions/rgb_to_yuv420sp_neon.cpp @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/rgb_to_yuv420sp.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "rgb_to_yuv420_neon.h" + +namespace kleidicv::neon { + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t rgb_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(y_dst, y_stride, height); + CHECK_POINTER_AND_STRIDE(uv_dst, uv_stride, (height + 1) / 2); /* presumably one interleaved chroma row per pair of source rows - confirm against the template arguments */ + CHECK_IMAGE_SIZE(width, height); + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, y_dst, 
y_stride, uv_dst, uv_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t rgba_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(y_dst, y_stride, height); + CHECK_POINTER_AND_STRIDE(uv_dst, uv_stride, (height + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t bgr_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(y_dst, y_stride, height); + CHECK_POINTER_AND_STRIDE(uv_dst, uv_stride, (height + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t bgra_to_yuv420sp_stripe_u8(const uint8_t *src, + size_t src_stride, uint8_t *y_dst, + size_t y_stride, uint8_t *uv_dst, + size_t uv_stride, size_t width, + size_t height, bool is_nv21, + size_t begin, size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(y_dst, y_stride, height); + CHECK_POINTER_AND_STRIDE(uv_dst, uv_stride, (height + 1) / 2); + CHECK_IMAGE_SIZE(width, height); + return RGBxorBGRxToYUV420::rgb2yuv420p_operation( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, begin, end); +} + +} // namespace kleidicv::neon diff --git 
a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 2a6bcebe8dc3ea6a79e35af93baba72347b2f111..b1e2e476b88ac5e4eca72275ad592fad15f26e13 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -76,6 +76,74 @@ KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_f32_to_u8, float, uint8_t); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_s8_to_f32, int8_t, float); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_u8_to_f32, uint8_t, float); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_rgb_to_yuv420p_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_rgb_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_rgba_to_yuv420p_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_rgba_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_bgr_to_yuv420p_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_bgr_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. 
+/// +/// Multithreaded implementation of kleidicv_bgra_to_yuv420p_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_bgra_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_rgb_to_yuv420sp_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_rgb_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_rgba_to_yuv420sp_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_rgba_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_bgr_to_yuv420sp_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_bgr_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_bgra_to_yuv420sp_u8 - see the +/// documentation of that function for more details. 
+kleidicv_error_t kleidicv_thread_bgra_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. /// /// Multithreaded implementation of kleidicv_yuv_sp_to_bgr_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index aef62654e8285d0f03f6b7a339b8b011fe82e7dd..c4990700db23cdaa2d3d2f7997bcc2de753a1191 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -13,6 +13,8 @@ #include "kleidicv/arithmetics/rotate.h" #include "kleidicv/arithmetics/scale.h" +#include "kleidicv/conversions/rgb_to_yuv420p.h" +#include "kleidicv/conversions/rgb_to_yuv420sp.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" @@ -250,6 +252,102 @@ kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride, return parallel_batches(callback, mt, width, 64); } +kleidicv_error_t kleidicv_thread_rgb_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_rgb_to_yuv420p_stripe_u8( + src, src_stride, dst, dst_stride, width, height, is_nv21, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); /* work is split in units of row pairs, the same units the stripe function takes as begin/end */ +} + +kleidicv_error_t kleidicv_thread_rgba_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_rgba_to_yuv420p_stripe_u8( + src, src_stride, dst, dst_stride, width, height, 
is_nv21, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_bgr_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_bgr_to_yuv420p_stripe_u8( + src, src_stride, dst, dst_stride, width, height, is_nv21, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_bgra_to_yuv420p_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool is_nv21, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_bgra_to_yuv420p_stripe_u8( + src, src_stride, dst, dst_stride, width, height, is_nv21, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_rgb_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_rgb_to_yuv420sp_stripe_u8( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); /* same row-pair batching as the planar variants above */ +} + +kleidicv_error_t kleidicv_thread_rgba_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_rgba_to_yuv420sp_stripe_u8( + src, src_stride, y_dst, y_stride, 
uv_dst, uv_stride, width, height, + is_nv21, static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_bgr_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_bgr_to_yuv420sp_stripe_u8( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_bgra_to_yuv420sp_u8( + const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, + uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, + bool is_nv21, kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_bgra_to_yuv420sp_stripe_u8( + src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, + is_nv21, static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + template inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 4001aef9a2ff53fbadccba289701b0647f6c79b0..6b2199533031a8bae3c114583efffcf914892ba2 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -17,6 +17,11 @@ YUVSP2BGRA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2B YUVSP2RGB: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGB_NV12)' YUVSP2RGBA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGBA_NV12)' +RGB2YUVP: opencv_perf_imgproc '*cvtColorRGB2YUV420p/*' '($PIXEL_FORMAT, COLOR_RGB2YUV_YV12)' 
+RGBA2YUVP: opencv_perf_imgproc '*cvtColorRGB2YUV420p/*' '($PIXEL_FORMAT, COLOR_RGBA2YUV_YV12)' +BGR2YUVP: opencv_perf_imgproc '*cvtColorRGB2YUV420p/*' '($PIXEL_FORMAT, COLOR_BGR2YUV_YV12)' +BGRA2YUVP: opencv_perf_imgproc '*cvtColorRGB2YUV420p/*' '($PIXEL_FORMAT, COLOR_BGRA2YUV_YV12)' + RGB2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_RGB2YUV)' BGR2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_BGR2YUV)' RGBA2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, CX_RGBA2YUV)' diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh index db777ecc3ecec9b8cd28ef02a85b4998aaa71e5a..98a49f83d1a218e6623732191c9f5ade099aa3b6 100755 --- a/scripts/ci-opencv.sh +++ b/scripts/ci-opencv.sh @@ -106,6 +106,10 @@ IMGPROC_TEST_PATTERNS=( '*Imgproc_ColorYUV*' '*Imgproc_cvtColor_BE.COLOR_YUV*' '*Imgproc_cvtColor_BE.COLOR_RGB2YUV' + '*Imgproc_cvtColor_BE.COLOR_RGB2YUV_YV12' + '*Imgproc_cvtColor_BE.COLOR_BGR2YUV_YV12' + '*Imgproc_cvtColor_BE.COLOR_RGBA2YUV_YV12' + '*Imgproc_cvtColor_BE.COLOR_BGRA2YUV_YV12' '*Imgproc_Threshold*' '*Imgproc_Morphology*' '*Imgproc_GaussianBlur*' diff --git a/test/api/test_rgb_to_yuv_420_p.cpp b/test/api/test_rgb_to_yuv_420_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20fbce0f7c896263644a45fb3bec6ab4525572fb --- /dev/null +++ b/test/api/test_rgb_to_yuv_420_p.cpp @@ -0,0 +1,238 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "test_config.h" + +class RGB2YUV420pTest : public testing::Test { + public: + struct TestParams { + size_t width; + size_t src_padding; + size_t dst_padding; + size_t height; + size_t channels; + bool is_nv21; + bool is_rgb; + }; + + static std::vector generate_test_cases( + const std::vector& widths, + const std::vector& src_paddings, + const 
std::vector& dst_paddings, + const std::vector& heights, const std::vector& channels, + const std::vector& uv_cases, + const std::vector& output_image_case) { + std::vector cases; + + for (size_t w : widths) { + for (size_t src_pad : src_paddings) { + for (size_t dst_pad : dst_paddings) { + for (size_t h : heights) { + for (size_t c : channels) { + for (bool uv_case : uv_cases) { + for (bool is_rgb : output_image_case) { + cases.push_back({w, src_pad, dst_pad, h, c, uv_case, is_rgb}); + } + } + } + } + } + } + } + + return cases; + } + + static std::vector get_test_cases() { + std::vector widths = {1, 2, 4, 6, 18, 27, 32, 64, 3}; + std::vector src_paddings = {0}; + std::vector dst_paddings = {0}; + std::vector heights = {2, 5, 11, 16}; + std::vector channels = {3, 4}; + std::vector uv_cases = {true, false}; + std::vector output_image_case = {true, false}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, uv_cases, output_image_case); + } + + void run_test_case(const TestParams& params) { + test::Array2D src{params.width * params.channels, params.height, + params.src_padding, params.channels}; + + test::Array2D expected_dst{ + params.width, (params.height * 3 + 1) / 2, params.dst_padding}; + + test::Array2D dst{params.width, (params.height * 3 + 1) / 2, + params.dst_padding}; + + test::PseudoRandomNumberGenerator input_value_random_range; + src.fill(input_value_random_range); + + calculate_reference(src.data(), src.stride(), expected_dst.data(), + expected_dst.stride(), params.width, params.height, + params.is_nv21, params.is_rgb, params.channels); + + auto status = KLEIDICV_OK; + + if (params.channels == 3) { + if (params.is_rgb) { + status = kleidicv_rgb_to_yuv420p_u8( + src.data(), src.stride(), dst.data(), dst.stride(), params.width, + params.height, params.is_nv21); + } else { + status = kleidicv_bgr_to_yuv420p_u8( + src.data(), src.stride(), dst.data(), dst.stride(), params.width, + params.height, params.is_nv21); + } + } + 
+ if (params.channels == 4) { + if (params.is_rgb) { + status = kleidicv_rgba_to_yuv420p_u8( + src.data(), src.stride(), dst.data(), dst.stride(), params.width, + params.height, params.is_nv21); + } else { + status = kleidicv_bgra_to_yuv420p_u8( + src.data(), src.stride(), dst.data(), dst.stride(), params.width, + params.height, params.is_nv21); + } + } + + EXPECT_EQ(KLEIDICV_OK, status); + EXPECT_EQ_ARRAY2D(expected_dst, dst); + } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src{20 * channels, 10, 0, channels}; + test::Array2D dst{20, (10 * 3 + 1) / 2}; + + test::test_null_args(impl, src.data(), src.stride(), dst.data(), + dst.stride(), dst.width(), dst.height(), is_nv21); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 0, 1, is_nv21)); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 1, 0, is_nv21)); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, is_nv21)); + EXPECT_EQ( + KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, is_nv21)); + } + + private: + // Coefficients for RGB to YUV420p conversion + static const int kWeightScale = 20; + static const int kRYWeight = + 269484; // 0.299055 * (236-16)/256 * (1 << kWeightScale) + static const int kGYWeight = + 528482; // 0.586472 * (236-16)/256 * (1 << kWeightScale) + static const int kBYWeight = + 102760; // 0.114035 * (236-16)/256 * (1 << kWeightScale) + static const int kRUWeight = -155188; // -0.148 * (1 << (kWeightScale-1)) + static const int kGUWeight = -305135; // -0.291 * (1 << (kWeightScale-1)) + static const int kBUWeight = 460324; // 0.439 * (1 << (kWeightScale-1)) + static const int kGVWeight = -385875; // -0.368 * (1 << (kWeightScale-1)) + static const int kBVWeight = -74448; // -0.071 * (1 << (kWeightScale-1)) + 
static uint8_t saturate_cast_s32_to_u8(int32_t rhs) { + return static_cast( + std::min(std::max(0, rhs), + static_cast(std::numeric_limits::max()))); + } + uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) { + const int shifted16 = (16 << kWeightScale); + const int halfShift = (1 << (kWeightScale - 1)); + int yy = + kRYWeight * r + kGYWeight * g + kBYWeight * b + halfShift + shifted16; + + return std::clamp(yy >> kWeightScale, 0, 0xff); + } + + static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t& u, + uint8_t& v) { + const int halfShift = (1 << (kWeightScale - 1)); + const int shifted128 = (128 << kWeightScale); + int uu = + kRUWeight * r + kGUWeight * g + kBUWeight * b + halfShift + shifted128; + int vv = + kBUWeight * r + kGVWeight * g + kBVWeight * b + halfShift + shifted128; + + u = std::clamp(uu >> kWeightScale, 0, 0xff); + v = std::clamp(vv >> kWeightScale, 0, 0xff); + } + void calculate_reference(const uint8_t* src, size_t src_stride, uint8_t* dst, + size_t dst_stride, size_t width, size_t height, + bool is_nv21, bool RGB, size_t channels) { + const uint8_t* src_row = nullptr; + uint8_t* uv_data = dst + dst_stride * height; + uint8_t* y_row = nullptr; + uint8_t* u_row = nullptr; + uint8_t* v_row = nullptr; + + for (size_t h = 0; h < height; h++) { + src_row = src + src_stride * h; + y_row = dst + dst_stride * h; + + bool evenRow = (h % 2) == 0; + if (evenRow) { + u_row = uv_data + dst_stride * (h / 4) + ((h / 2) % 2) * (width / 2); + v_row = uv_data + dst_stride * ((h + height + 1) / 4) + + (((h + height + 1) / 2) % 2) * (width / 2); + } + + for (size_t w = 0; w < width; w++) { + uint8_t b0{}, g0{}, r0{}; + b0 = src_row[w * channels + 0]; + g0 = src_row[w * channels + 1]; + r0 = src_row[w * channels + 2]; + if (RGB) { + std::swap(b0, r0); + } + uint8_t y0 = rgb_to_y(r0, g0, b0); + y_row[w] = y0; + bool evenCol = (w % 2) == 0; + if (evenRow && evenCol) { + uint8_t uu{}, vv{}; + rgb_to_uv(r0, g0, b0, uu, vv); + if (is_nv21) { + std::swap(uu, vv); + 
} + u_row[w >> 1] = uu; + v_row[w >> 1] = vv; + } + } + } + } +}; + +TEST_F(RGB2YUV420pTest, ConvertspaddedInputsWithAllParamCombinations) { + for (const auto& params : get_test_cases()) { + run_test_case(params); + } +} + +TEST_F(RGB2YUV420pTest, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_rgb_to_yuv420p_u8, 3, true); + run_unsupported(kleidicv_rgba_to_yuv420p_u8, 4, true); + run_unsupported(kleidicv_bgr_to_yuv420p_u8, 3, true); + run_unsupported(kleidicv_bgra_to_yuv420p_u8, 4, true); + run_unsupported(kleidicv_rgb_to_yuv420p_u8, 3, false); + run_unsupported(kleidicv_rgba_to_yuv420p_u8, 4, false); + run_unsupported(kleidicv_bgr_to_yuv420p_u8, 3, false); + run_unsupported(kleidicv_bgra_to_yuv420p_u8, 4, false); +} diff --git a/test/api/test_rgb_to_yuv_420_sp.cpp b/test/api/test_rgb_to_yuv_420_sp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e15530e435814a189d7c918c2965e58e81c50d3 --- /dev/null +++ b/test/api/test_rgb_to_yuv_420_sp.cpp @@ -0,0 +1,256 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "framework/utils.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/utils.h" +#include "test_config.h" + +class RGB2YUV420SpTest : public testing::Test { + public: + struct TestParams { + size_t width; + size_t src_padding; + size_t dst_padding; + size_t height; + size_t channels; + bool is_nv21; + bool is_rgb; + }; + + static std::vector generate_test_cases( + const std::vector& widths, + const std::vector& src_paddings, + const std::vector& dst_paddings, + const std::vector& heights, const std::vector& channels, + const std::vector& uv_cases, + const std::vector& output_image_case) { + std::vector cases; + + for (size_t w : widths) { + for (size_t src_pad : src_paddings) { + for (size_t dst_pad : dst_paddings) { + for (size_t h : 
heights) { + for (size_t c : channels) { + for (bool uv_case : uv_cases) { + for (bool is_rgb : output_image_case) { + cases.push_back({w, src_pad, dst_pad, h, c, uv_case, is_rgb}); + } + } + } + } + } + } + } + + return cases; + } + + static std::vector get_test_cases() { + std::vector widths = {1, 2, 4, 6, 18, 27, 32, 64, 3}; + std::vector src_paddings = {0}; + std::vector dst_paddings = {0}; + std::vector heights = {2, 5, 11, 16}; + std::vector channels = {3, 4}; + std::vector uv_cases = {true, false}; + std::vector output_image_case = {true, false}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, uv_cases, output_image_case); + } + + void run_test_case(const TestParams& params) { + test::Array2D src{params.width * params.channels, params.height, + params.src_padding, params.channels}; + + test::Array2D expected_y_dst{params.width, params.height, + params.dst_padding}; + + test::Array2D expected_uv_dst{ + KLEIDICV_TARGET_NAMESPACE::align_up(params.width, 2), + (params.height + 1) / 2, params.dst_padding}; + + test::Array2D y_dst{params.width, params.height, + params.dst_padding}; + + test::Array2D uv_dst{ + KLEIDICV_TARGET_NAMESPACE::align_up(params.width, 2), + (params.height + 1) / 2, params.dst_padding}; + + test::PseudoRandomNumberGenerator input_value_random_range; + src.fill(input_value_random_range); + + calculate_reference(src.data(), src.stride(), expected_y_dst.data(), + expected_y_dst.stride(), expected_uv_dst.data(), + expected_uv_dst.stride(), params.width, params.height, + params.is_nv21, params.is_rgb, params.channels); + + auto status = KLEIDICV_OK; + + if (params.channels == 3) { + if (params.is_rgb) { + status = kleidicv_rgb_to_yuv420sp_u8( + src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), params.width, params.height, + params.is_nv21); + } else { + status = kleidicv_bgr_to_yuv420sp_u8( + src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), 
uv_dst.stride(), params.width, params.height, + params.is_nv21); + } + } + + if (params.channels == 4) { + if (params.is_rgb) { + status = kleidicv_rgba_to_yuv420sp_u8( + src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), params.width, params.height, + params.is_nv21); + } else { + status = kleidicv_bgra_to_yuv420sp_u8( + src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), params.width, params.height, + params.is_nv21); + } + } + + EXPECT_EQ(KLEIDICV_OK, status); + EXPECT_EQ_ARRAY2D(expected_y_dst, y_dst); + EXPECT_EQ_ARRAY2D(expected_uv_dst, uv_dst); + } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src{20 * channels, 10, 0, channels}; + test::Array2D y_dst{20, 10}; + test::Array2D uv_dst{20, 5}; + + test::test_null_args(impl, src.data(), src.stride(), y_dst.data(), + y_dst.stride(), uv_dst.data(), uv_dst.stride(), + src.width(), src.height(), is_nv21); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), 0, 1, is_nv21)); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), 1, 0, is_nv21)); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, is_nv21)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), y_dst.data(), y_dst.stride(), + uv_dst.data(), uv_dst.stride(), KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS, is_nv21)); + } + + private: + // Coefficients for RGB to YUV420p conversion + static const int kWeightScale = 20; + static const int kRYWeight = + 269484; // 0.299055 * (236-16)/256 * (1 << kWeightScale) + static const int kGYWeight = + 528482; // 0.586472 * (236-16)/256 * (1 << kWeightScale) + static const int kBYWeight = + 102760; // 0.114035 * 
(236-16)/256 * (1 << kWeightScale) + static const int kRUWeight = -155188; // -0.148 * (1 << (kWeightScale-1)) + static const int kGUWeight = -305135; // -0.291 * (1 << (kWeightScale-1)) + static const int kBUWeight = 460324; // 0.439 * (1 << (kWeightScale-1)) + static const int kGVWeight = -385875; // -0.368 * (1 << (kWeightScale-1)) + static const int kBVWeight = -74448; // -0.071 * (1 << (kWeightScale-1)) + static uint8_t saturate_cast_s32_to_u8(int32_t rhs) { + return static_cast( + std::min(std::max(0, rhs), + static_cast(std::numeric_limits::max()))); + } + uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) { + const int shifted16 = (16 << kWeightScale); + const int halfShift = (1 << (kWeightScale - 1)); + int yy = + kRYWeight * r + kGYWeight * g + kBYWeight * b + halfShift + shifted16; + + return std::clamp(yy >> kWeightScale, 0, 0xff); + } + + static void rgb_to_uv(uint8_t r, uint8_t g, uint8_t b, uint8_t& u, + uint8_t& v) { + const int halfShift = (1 << (kWeightScale - 1)); + const int shifted128 = (128 << kWeightScale); + int uu = + kRUWeight * r + kGUWeight * g + kBUWeight * b + halfShift + shifted128; + int vv = + kBUWeight * r + kGVWeight * g + kBVWeight * b + halfShift + shifted128; + + u = std::clamp(uu >> kWeightScale, 0, 0xff); + v = std::clamp(vv >> kWeightScale, 0, 0xff); + } + void calculate_reference(const uint8_t* src, size_t src_stride, + uint8_t* y_dst, size_t y_stride, uint8_t* uv_dst, + size_t uv_stride, size_t width, size_t height, + bool is_nv21, bool RGB, size_t channels) { + const uint8_t* src_row = nullptr; + uint8_t* y_row = nullptr; + uint8_t* u_row = nullptr; + + for (size_t h = 0; h < height; h++) { + src_row = src + src_stride * h; + y_row = y_dst + y_stride * h; + + bool evenRow = (h % 2) == 0; + if (evenRow) { + u_row = uv_dst + uv_stride * (h / 2); + } + + for (size_t w = 0; w < width; w++) { + uint8_t b0{}, g0{}, r0{}; + b0 = src_row[w * channels + 0]; + g0 = src_row[w * channels + 1]; + r0 = src_row[w * channels + 2]; + if 
(RGB) { + std::swap(b0, r0); + } + uint8_t y0 = rgb_to_y(r0, g0, b0); + y_row[w] = y0; + bool evenCol = (w % 2) == 0; + if (evenRow && evenCol) { + uint8_t uu{}, vv{}; + rgb_to_uv(r0, g0, b0, uu, vv); + if (is_nv21) { + std::swap(uu, vv); + } + u_row[w + 0] = uu; + u_row[w + 1] = vv; + } + } + } + } +}; + +TEST_F(RGB2YUV420SpTest, ConvertspaddedInputsWithAllParamCombinations) { + for (const auto& params : get_test_cases()) { + run_test_case(params); + } +} + +TEST_F(RGB2YUV420SpTest, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_rgb_to_yuv420sp_u8, 3, true); + run_unsupported(kleidicv_rgba_to_yuv420sp_u8, 4, true); + run_unsupported(kleidicv_bgr_to_yuv420sp_u8, 3, true); + run_unsupported(kleidicv_bgra_to_yuv420sp_u8, 4, true); + run_unsupported(kleidicv_rgb_to_yuv420sp_u8, 3, false); + run_unsupported(kleidicv_rgba_to_yuv420sp_u8, 4, false); + run_unsupported(kleidicv_bgr_to_yuv420sp_u8, 3, false); + run_unsupported(kleidicv_bgra_to_yuv420sp_u8, 4, false); +} diff --git a/test/api/test_thread_rgb_to_yuv_p.cpp b/test/api/test_thread_rgb_to_yuv_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba31e1b9c1b8d8db03bdc5d97996fc22d0995ad6 --- /dev/null +++ b/test/api/test_thread_rgb_to_yuv_p.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class RgbToYuv420Thread : public testing::TestWithParam

{ + public: + template + void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t channels) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src(size_t{width} * channels, height), + dst_single(width, (height * 3 + 1) / 2), + dst_multi(width, (height * 3 + 1) / 2); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = + single_threaded_func(src.data(), src.stride(), dst_single.data(), + dst_single.stride(), width, height, false); + + kleidicv_error_t multi_result = multithreaded_func( + src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width, + height, false, get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } +}; + +TEST_P(RgbToYuv420Thread, FromBGR) { + check(kleidicv_bgr_to_yuv420p_u8, kleidicv_thread_bgr_to_yuv420p_u8, 3); +} +TEST_P(RgbToYuv420Thread, FromBGRA) { + check(kleidicv_bgra_to_yuv420p_u8, kleidicv_thread_bgra_to_yuv420p_u8, 4); +} +TEST_P(RgbToYuv420Thread, FromRGB) { + check(kleidicv_rgb_to_yuv420p_u8, kleidicv_thread_rgb_to_yuv420p_u8, 3); +} +TEST_P(RgbToYuv420Thread, FromRGBA) { + check(kleidicv_rgba_to_yuv420p_u8, kleidicv_thread_rgba_to_yuv420p_u8, 4); +} + +INSTANTIATE_TEST_SUITE_P(, RgbToYuv420Thread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}, + P{12, 37, 5}, P{2, 1000, 2})); diff --git a/test/api/test_thread_rgb_to_yuv_sp.cpp b/test/api/test_thread_rgb_to_yuv_sp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a47c9d76361ed296208554a6c60d3e5f5fed9807 --- /dev/null +++ b/test/api/test_thread_rgb_to_yuv_sp.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "framework/utils.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/utils.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" +#include "test_config.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class RgbToYuv420SpThread : public testing::TestWithParam

{ + public: + template + void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t channels) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src(size_t{width} * channels, height), + y_dst_single(width, height), + uv_dst_single(KLEIDICV_TARGET_NAMESPACE::align_up(width, 2), + (height + 1) / 2), + y_dst_multi(width, height), + uv_dst_multi(KLEIDICV_TARGET_NAMESPACE::align_up(width, 2), + (height + 1) / 2); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = single_threaded_func( + src.data(), src.stride(), y_dst_single.data(), y_dst_single.stride(), + uv_dst_single.data(), uv_dst_single.stride(), width, height, false); + + kleidicv_error_t multi_result = multithreaded_func( + src.data(), src.stride(), y_dst_multi.data(), y_dst_multi.stride(), + uv_dst_multi.data(), uv_dst_multi.stride(), width, height, false, + get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(y_dst_multi, y_dst_single); + EXPECT_EQ_ARRAY2D(uv_dst_multi, uv_dst_single); + } +}; + +TEST_P(RgbToYuv420SpThread, FromBGR) { + check(kleidicv_bgr_to_yuv420sp_u8, kleidicv_thread_bgr_to_yuv420sp_u8, 3); +} +TEST_P(RgbToYuv420SpThread, FromBGRA) { + check(kleidicv_bgra_to_yuv420sp_u8, kleidicv_thread_bgra_to_yuv420sp_u8, 4); +} +TEST_P(RgbToYuv420SpThread, FromRGB) { + check(kleidicv_rgb_to_yuv420sp_u8, kleidicv_thread_rgb_to_yuv420sp_u8, 3); +} +TEST_P(RgbToYuv420SpThread, FromRGBA) { + check(kleidicv_rgba_to_yuv420sp_u8, kleidicv_thread_rgba_to_yuv420sp_u8, 4); +} + +INSTANTIATE_TEST_SUITE_P(, RgbToYuv420SpThread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}, + P{12, 37, 5}, P{2, 1000, 2}));