From 26e326ef806db812badb7752e0cbd5c5df3e05ef Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Tue, 5 Aug 2025 15:32:36 +0100 Subject: [PATCH] Add YUV420p to RGBx & BGRx --- CHANGELOG.md | 1 + adapters/opencv/kleidicv_hal.cpp | 36 +++ adapters/opencv/kleidicv_hal.h | 15 + benchmark/benchmark.cpp | 35 ++- conformity/opencv/test_cvtcolor.cpp | 13 +- doc/functionality.md | 44 +-- doc/opencv.md | 13 +- .../{yuv_sp_to_rgb.h => yuv_420_to_rgb.h} | 107 ++++++- kleidicv/include/kleidicv/kleidicv.h | 57 ++++ kleidicv/src/conversions/yuv420_to_rgb_neon.h | 260 ++++++++++++++++++ kleidicv/src/conversions/yuv420_to_rgb_sc.h | 158 +++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_api.cpp | 58 ++++ .../src/conversions/yuv_p_to_rgb_neon.cpp | 248 +++++++++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_sc.h | 231 ++++++++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp | 40 +++ .../src/conversions/yuv_p_to_rgb_sve2.cpp | 43 +++ .../src/conversions/yuv_sp_to_rgb_api.cpp | 2 +- .../src/conversions/yuv_sp_to_rgb_neon.cpp | 248 +---------------- kleidicv/src/conversions/yuv_sp_to_rgb_sc.h | 151 +--------- .../include/kleidicv_thread/kleidicv_thread.h | 32 +++ kleidicv_thread/src/kleidicv_thread.cpp | 49 ++++ scripts/benchmark/benchmarks.txt | 4 + test/api/test_thread_yuv_p_to_rgb.cpp | 105 +++++++ test/api/test_thread_yuv_sp_to_rgb.cpp | 62 ++++- test/api/test_yuv_p_to_rgb.cpp | 216 +++++++++++++++ 25 files changed, 1810 insertions(+), 418 deletions(-) rename kleidicv/include/kleidicv/conversions/{yuv_sp_to_rgb.h => yuv_420_to_rgb.h} (51%) create mode 100644 kleidicv/src/conversions/yuv420_to_rgb_neon.h create mode 100644 kleidicv/src/conversions/yuv420_to_rgb_sc.h create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_api.cpp create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_sc.h create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp create mode 100644 
kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp create mode 100644 test/api/test_thread_yuv_p_to_rgb.cpp create mode 100644 test/api/test_yuv_p_to_rgb.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 540529b88..3bac3898d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ This changelog aims to follow the guiding principles of - Gaussian Blur for any odd kernel size (up to 255x255) with replicated borders - Conversion from packed YUV 4:4:4 (interleaved and non-subsampled) to RGBA/BGRA. - Add SME2 version of saturating add with multivector loads and stores. It is marked as experimental as it is not covered by CI as of now. +- Conversion from YUV 4:2:0 planar (I420/YV12) to RGBA/BGRA. ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 7fa9209f0..6f8af0de2 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -241,6 +241,42 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } +int yuv_to_bgr_p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int dst_width, int dst_height, int dcn, + bool swapBlue, int uIdx) { + const bool is_bgr = !swapBlue; + const bool is_nv21 = (uIdx != 0); + auto mt = get_multithreading(); + + if (dcn == 3) { + if (is_bgr) { + return convert_error(kleidicv_thread_yuv_p_to_bgr_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, + dst_height, is_nv21, mt)); + } + return convert_error(kleidicv_thread_yuv_p_to_rgb_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, dst_height, + is_nv21, mt)); + } + + if (dcn == 4) { + if (is_bgr) { + return convert_error(kleidicv_thread_yuv_p_to_bgra_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, + dst_height, is_nv21, 
mt)); + } + return convert_error(kleidicv_thread_yuv_p_to_rgba_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, dst_height, + is_nv21, mt)); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 0c9cd5e6b..4d3c95989 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -38,6 +38,10 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int yuv_to_bgr_p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int dst_width, int dst_height, int dcn, + bool swapBlue, int uIdx); + int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr); @@ -238,6 +242,17 @@ static inline int kleidicv_yuv_to_bgr_sp_ex_with_fallback( #undef cv_hal_cvtTwoPlaneYUVtoBGREx #define cv_hal_cvtTwoPlaneYUVtoBGREx kleidicv_yuv_to_bgr_sp_ex_with_fallback +// yuv_to_bgr_p +static inline int kleidicv_yuv_to_bgr_p_with_fallback( + const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, + int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { + return KLEIDICV_HAL_FALLBACK_FORWARD( + yuv_to_bgr_p, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, + dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); +} +#undef cv_hal_cvtThreePlaneYUVtoBGR +#define cv_hal_cvtThreePlaneYUVtoBGR kleidicv_yuv_to_bgr_p_with_fallback + // yuv_to_bgr static inline int kleidicv_yuv_to_bgr_with_fallback( const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 
218fb8902..6ee20f15d 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -715,10 +715,11 @@ template static void yuv_sp(Function f, benchmark::State& state) { bench_functor(state, [f]() { (void)f(get_source_buffer_a(), image_width * sizeof(uint8_t), - get_source_buffer_b(), + get_source_buffer_b(), (image_width / 2) * sizeof(uint8_t), get_destination_buffer_a(), - image_width * sizeof(uint8_t), image_width, image_height, true); + image_width * OutChannels * sizeof(uint8_t), image_width, + image_height, true); }); } @@ -742,6 +743,36 @@ static void yuv_sp_to_bgra(benchmark::State& state) { } BENCHMARK(yuv_sp_to_bgra); +template +static void yuv_p(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f(get_source_buffer_a(), image_width * sizeof(uint8_t), + get_destination_buffer_a(), + image_width * OutChannels * sizeof(uint8_t), image_width, + image_height, true); + }); +} + +static void yuv_p_to_rgb(benchmark::State& state) { + yuv_p<3>(kleidicv_yuv_p_to_rgb_u8, state); +} +BENCHMARK(yuv_p_to_rgb); + +static void yuv_p_to_bgr(benchmark::State& state) { + yuv_p<3>(kleidicv_yuv_p_to_bgr_u8, state); +} +BENCHMARK(yuv_p_to_bgr); + +static void yuv_p_to_rgba(benchmark::State& state) { + yuv_p<4>(kleidicv_yuv_p_to_rgba_u8, state); +} +BENCHMARK(yuv_p_to_rgba); + +static void yuv_p_to_bgra(benchmark::State& state) { + yuv_p<4>(kleidicv_yuv_p_to_bgra_u8, state); +} +BENCHMARK(yuv_p_to_bgra); + template static void morphology(Function f, benchmark::State& state) { kleidicv_morphology_context_t* context = nullptr; diff --git a/conformity/opencv/test_cvtcolor.cpp b/conformity/opencv/test_cvtcolor.cpp index 53425f39c..232aa28a7 100644 --- a/conformity/opencv/test_cvtcolor.cpp +++ b/conformity/opencv/test_cvtcolor.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -33,7 +33,8 @@ bool 
test_cvtcolor(int index, RecreatedMessageQueue& request_queue, return false; }; - // OpenCV only accepts two-plane images with an even number of columns & rows. + // OpenCV only accepts images with an even number of columns & rows to some + // YUV formats like YUV420. for (size_t x = 4; x <= 16; x += 2) { for (size_t y = 2; y <= 16; y += 2) { if (check(x, y)) { @@ -58,6 +59,14 @@ bool test_cvtcolor(int index, RecreatedMessageQueue& request_queue, std::vector& cvtcolor_tests_get() { // clang-format off static std::vector tests = { + CVTCOLOR_TEST(YUV2BGR_YV12), + CVTCOLOR_TEST(YUV2BGRA_YV12), + CVTCOLOR_TEST(YUV2RGB_YV12), + CVTCOLOR_TEST(YUV2RGBA_YV12), + CVTCOLOR_TEST(YUV2BGR_IYUV), + CVTCOLOR_TEST(YUV2BGRA_IYUV), + CVTCOLOR_TEST(YUV2RGB_IYUV), + CVTCOLOR_TEST(YUV2RGBA_IYUV), CVTCOLOR_TEST(YUV2BGR_NV12), CVTCOLOR_TEST(YUV2RGB_NV12), CVTCOLOR_TEST(YUV2BGRA_NV12), diff --git a/doc/functionality.md b/doc/functionality.md index 43ae983c8..765af9c1e 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -29,26 +29,30 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. 
| Bitwise And | x | ## Color conversions -| | u8 | -|--------------|-----| -| Gray-RGB | x | -| Gray-RGBA | x | -| RGB-BGR | x | -| BGR-RGB | x | -| RGBA-BGRA | x | -| BGRA-RGBA | x | -| YUV420-BGR | x | -| YUV420-BGRA | x | -| YUV420-RGB | x | -| YUV420-RGBA | x | -| YUV-BGR | x | -| YUV-RGB | x | -| YUV-BGRA | x | -| YUV-RGBA | x | -| RGB-YUV | x | -| RGBA-YUV | x | -| BGR-YUV | x | -| BGRA-YUV | x | +| | u8 | +|------------------------------|-----| +| Gray-RGB | x | +| Gray-RGBA | x | +| RGB-BGR | x | +| BGR-RGB | x | +| RGBA-BGRA | x | +| BGRA-RGBA | x | +| YUV420 (planar) - BGR | x | +| YUV420 (planar) - BGRA | x | +| YUV420 (planar) - RGB | x | +| YUV420 (planar) - RGBA | x | +| YUV420 (semi-planar) - BGR | x | +| YUV420 (semi-planar) - BGRA | x | +| YUV420 (semi-planar) - RGB | x | +| YUV420 (semi-planar) - RGBA | x | +| YUV-BGR | x | +| YUV-RGB | x | +| YUV-BGRA | x | +| YUV-RGBA | x | +| RGB-YUV | x | +| RGBA-YUV | x | +| BGR-YUV | x | +| BGRA-YUV | x | ## Data type conversions | | u8 | s8 | f32 | diff --git a/doc/opencv.md b/doc/opencv.md index 6d0021177..e5303182a 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -67,12 +67,13 @@ Notes on parameters: * `src.channels()` - supports 3 for RGB and 4 for RGBA. * `dst.channels()` - supports 3 for RGB and 4 for RGBA. 
-#### [`COLOR_YUV2RGB_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9),[`COLOR_YUV2BGR_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a305e5da3816c78b3d1ffa0498424e94f),[`COLOR_YUV2RGBA_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a18346327c937bca2aa2856914ff11507),[`COLOR_YUV2BGRA_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a0ffa81c19231ddd2e9cee8616a3a4673) -YUV420 to RGB/RGBA image conversion (semi-planar). Function accepts Y plane and UV planes separately.\ -All supported permutations are listed in the table below. -| | RGB | BGR | RGBA | BGRA | -|---|-----|-----|------|------| -|YUV| x | x | x | x | +#### [`COLOR_YUV2RGB_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9:~:text=V%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_YUV2RGB_I420,-Python%3A%20cv.COLOR_YUV2RGB_I420), [`COLOR_YUV2RGB_NV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9:~:text=Python%3A%20cv.COLOR_YUV2RGB-,COLOR_YUV2RGB_NV12,-Python%3A%20cv.COLOR_YUV2RGB_NV12) +YUV420 to RGB/RGBA image conversion supporting **both planar (I420/YV12)** and **semi-planar (NV12/NV21)** layouts. +All supported permutations are shown below: +| YUV Layout | RGB | BGR | RGBA | BGRA | +|---------------|-----|-----|------|------| +| Planar | x | x | x | x | +| Semi-planar | x | x | x | x | Notes on parameters: * `dst.channels()` - supports 3 for RGB and 4 for RGBA. 
diff --git a/kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h b/kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h similarity index 51% rename from kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h rename to kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h index f9dda4952..6e2c10c0f 100644 --- a/kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h +++ b/kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h @@ -2,11 +2,46 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H -#define KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H +#ifndef KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H +#define KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H #include "kleidicv/kleidicv.h" +extern "C" { + +// For internal use only. See instead kleidicv_yuv_p_to_rgb_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to RGB format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_rgb_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_rgba_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to RGBA format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_rgba_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_bgr_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to BGR format. The stripe is defined by the range [begin, end]. 
+KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_bgr_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_bgra_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to BGRA format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_bgra_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); +} + namespace kleidicv { /* Analog YUV to RGB conversion according to ITU-R BT.601-7 in matrix form: @@ -106,6 +141,28 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace neon namespace sve2 { @@ -128,6 +185,28 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + 
+kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace sve2 namespace sme { @@ -150,8 +229,30 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace sme } // namespace kleidicv -#endif // KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H +#endif // KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H diff --git a/kleidicv/include/kleidicv/kleidicv.h 
b/kleidicv/include/kleidicv/kleidicv.h index 09761af88..cdb2e0160 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -721,6 +721,63 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_bgra_u8, const uint8_t *src_y, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); +#define KLEIDICV_OP_YUVP_TO_RGB(name) \ + kleidicv_error_t name(const uint8_t *src, size_t src_stride, uint8_t *dst, \ + size_t dst_stride, size_t width, size_t height, \ + bool v_first) + +/// Converts a planar YUV420 image (I420 or YV12 layout) to RGB, RGBA, BGR, or +/// BGRA format. All channels are 8-bit wide. If the output format includes an +/// alpha channel, the alpha value is set to 0xFF. +/// +/// ### Source format: Planar YUV420 +/// The input buffer consists of three planes stored sequentially in memory: +/// - Y plane: full resolution, size = width × height +/// - U plane: quarter resolution, size = (width / 2) × (height / 2) +/// - V plane: quarter resolution, size = (width / 2) × (height / 2) +/// +/// ### Destination format +/// Destination data uses interleaved pixel layout with 3 or 4 channels per +/// pixel: +/// - R, G, B +/// - B, G, R +/// - R, G, B, Alpha +/// - B, G, R, Alpha +/// +/// One pixel occupies 3 or 4 bytes, depending on the format. +/// +/// Width and height refer to the logical image dimensions, i.e., number of +/// pixels per row and number of rows. The total number of pixels must not +/// exceed @ref KLEIDICV_MAX_IMAGE_PIXELS. +/// +/// @param src Pointer to the source buffer containing Y + U + V or Y + +/// V + U in sequential planes. +/// Must be non-null. +/// @param src_stride Stride (in bytes) between rows in the Y plane. +/// Must be at least `width`. This same stride is reused to +/// compute row access in the U and V planes, which follow +/// the Y plane in memory. 
In such memory layouts +/// (e.g., OpenCV’s I420/YV12), the U and V planes are +/// located directly after the Y plane, and their row +/// stepping can be expressed as: `uvsteps[2] = { width / 2, +/// src_stride - width / 2 }`. This ensures correct row +/// traversal across the subsampled chroma planes. +/// @param dst Pointer to the destination data. Must be non-null. +/// @param dst_stride Byte offset between the start of one destination row and +/// the next. Must be at least `(destination channel count) * +/// width`, unless the image has only one row. +/// @param width Number of pixels in a row. +/// @param height Number of rows in the data. +/// @param v_first If true, treat the layout as YV12 (Y + V + U). Otherwise, +/// I420 (Y + U + V). +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_rgb_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_rgba_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_bgr_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_bgra_u8); + /// Converts a YUV image to RGB or RGBA, pixel by pixel. All channels are 8-bit /// wide. 
/// diff --git a/kleidicv/src/conversions/yuv420_to_rgb_neon.h b/kleidicv/src/conversions/yuv420_to_rgb_neon.h new file mode 100644 index 000000000..685cd8cd4 --- /dev/null +++ b/kleidicv/src/conversions/yuv420_to_rgb_neon.h @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV420_TO_RGB_NEON_H +#define KLEIDICV_YUV420_TO_RGB_NEON_H + +#include + +#include +#include + +#include "kleidicv/kleidicv.h" +#include "kleidicv/traits.h" + +namespace kleidicv::neon { + +template +class YUV420XToRGBxOrBGRx { + public: + using ScalarType = uint8_t; + using VectorType = uint8x16_t; + + int32x4_t y_weight_; + int32x2x2_t uv_weights_; + int32x4_t r_base_, g_base_, b_base_; + int8x16x4_t de_interleave_indices_; + const bool v_first_; + + // Returns the number of channels in the output image. + static constexpr size_t output_channels() { + return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; + } + + static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { + return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), + vmovn_s32(vshrq_n_s32(b, kWeightScale))); + } + + // clang-format off + + static constexpr int8_t kDeInterleaveTableIndices[64] = { + /* low and even */ + 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, + /* high and even */ + 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, + /* low and odd */ + 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, + /* high and odd */ + 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, + }; + + // clang-format on + + explicit YUV420XToRGBxOrBGRx(bool v_first) + : y_weight_{vdupq_n_s32(kYWeight)}, + uv_weights_{vld2_s32(kUVWeights)}, + r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - + 128 * kUVWeights[kRVWeightIndex])}, + g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - + 128 * (kUVWeights[1] + kUVWeights[2]))}, + b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, + 
de_interleave_indices_{}, + v_first_{v_first} { + neon::VecTraits::load(kDeInterleaveTableIndices, + de_interleave_indices_); + } + + void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l, + int32x4_t u_h, int32x4_t v_l, int32x4_t v_h, + ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { + // Y' = saturating(Ya - 16) and widen to 32-bits. + uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); + uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); + + uint32x4_t y0_m16_even_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); + uint32x4_t y0_m16_even_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); + uint32x4_t y0_m16_odd_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); + uint32x4_t y0_m16_odd_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); + + uint32x4_t y1_m16_even_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); + uint32x4_t y1_m16_even_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); + uint32x4_t y1_m16_odd_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); + uint32x4_t y1_m16_odd_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); + + // Y = Weight(Y) * Y' + y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_); + y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); + y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); + y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); + + y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); + y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); + y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); + y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); + + // Swap U and V planes for YV12 layout. 
+ if (v_first_) { + std::swap(u_l, v_l); + std::swap(u_h, v_h); + } + + // R - Y = Rbase + Weight(RV) * V = + // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V + int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); + int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); + + // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = + // Weight(GU) * ((1 << (SCALE - 1)) - 128) + + // Weight(GV) * ((1 << (SCALE - 1)) - 128) + + // Weight(GU) * U + Weight(GV) * V + int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); + int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); + g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); + g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); + + // B - Y = Bbase + Weight(BU) * U = + // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U + int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); + int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); + + // R = (R - Y) + Y + int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); + int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); + int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); + int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); + int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); + int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); + + int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); + int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); + int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); + int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); + int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); + int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); + + // G = (G - Y) + Y + int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); + int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); + int32x4_t g0_odd_l = 
vaddq_s32(g_sub_y_l, y0_m16_odd_l); + int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); + int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); + int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); + + int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); + int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); + int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); + int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); + int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); + int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); + + // B = (B - Y) + Y + int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); + int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); + int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); + int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); + int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); + int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); + + int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l); + int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); + int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); + int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); + int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); + int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); + + // Zip even and odd RGB pixels. 
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + + if constexpr (ALPHA) { + uint8x16x4_t rgba0, rgba1; + // Red channel + rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); + rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); + // Green channel + rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); + rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + // Blue channel + rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); + rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); + // Alpha channel + rgba0.val[3] = vdupq_n_u8(0xFF); + rgba1.val[3] = vdupq_n_u8(0xFF); + + if constexpr (BGR) { + std::swap(rgba0.val[0], rgba0.val[2]); + std::swap(rgba1.val[0], rgba1.val[2]); + } + + // Store RGB pixels to memory. + vst4q_u8(rgbx_row_0, rgba0); + vst4q_u8(rgbx_row_1, rgba1); + } else { + uint8x16x3_t rgb0, rgb1; + // Red channel + rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); + rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); + // Green channel + rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); + rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + // Blue channel + rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); + rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); + + if constexpr (BGR) { + std::swap(rgb0.val[0], rgb0.val[2]); + std::swap(rgb1.val[0], rgb1.val[2]); + } + + // Store RGB pixels to memory. 
+ vst3q_u8(rgbx_row_0, rgb0); + vst3q_u8(rgbx_row_1, rgb1); + } + } + + void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128, + int32_t v_m128, uint8_t *rgbx_rows[2]) { + for (size_t selector = 0; selector < 2; ++selector) { + int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); + int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; + int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + + kUVWeights[kGVWeightIndex] * v_m128; + int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; + + r = rounding_shift_right(r, kWeightScale); + g = rounding_shift_right(g, kWeightScale); + b = rounding_shift_right(b, kWeightScale); + + if constexpr (BGR) { + std::swap(r, b); + } + + rgbx_rows[selector][0] = saturating_cast(r); + rgbx_rows[selector][1] = saturating_cast(g); + rgbx_rows[selector][2] = saturating_cast(b); + + if constexpr (ALPHA) { + rgbx_rows[selector][3] = 0xFF; + } + + rgbx_rows[selector] += ALPHA ? 4 : 3; + } + } +}; +} // namespace kleidicv::neon + +#endif // KLEIDICV_YUV420_TO_RGB_NEON_H diff --git a/kleidicv/src/conversions/yuv420_to_rgb_sc.h b/kleidicv/src/conversions/yuv420_to_rgb_sc.h new file mode 100644 index 000000000..af7f3c552 --- /dev/null +++ b/kleidicv/src/conversions/yuv420_to_rgb_sc.h @@ -0,0 +1,158 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV420_TO_RGB_SC_H +#define KLEIDICV_YUV420_TO_RGB_SC_H + +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class YUV420XToRGBxOrBGRx { + public: + const bool v_first_; + explicit YUV420XToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : v_first_{v_first} {} + void yuv420x_to_rgb(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, + svint16_t &v, uint8_t *rgbx_row_0, + uint8_t *rgbx_row_1) KLEIDICV_STREAMING { + // Both the rounding shift right constant and the -128 value are included. 
+ constexpr int32_t kOffset = 1 << (kWeightScale - 1); + svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); + svint32_t g_base = + svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); + svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); + + // Y' = saturating(Ya - 16) and widen to signed 32-bits. + svuint8_t y0_m16 = svqsub(y0, static_cast(16)); + svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom + svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top + svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b)); + svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b)); + svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t)); + svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t)); + + svuint8_t y1_m16 = svqsub(y1, static_cast(16)); + svuint16_t y1_m16_b = svmovlb(y1_m16); + svuint16_t y1_m16_t = svmovlt(y1_m16); + svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b)); + svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b)); + svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t)); + svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t)); + + // Y = Weight(Y) * Y' + y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight); + y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight); + y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight); + y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight); + + y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight); + y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight); + y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight); + y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight); + + // Swap U and V planes for YV12 layout. 
+ if (v_first_) { + swap_scalable(u, v); + } + + svint32_t u_b = svmovlb(u); + svint32_t u_t = svmovlt(u); + svint32_t v_b = svmovlb(v); + svint32_t v_t = svmovlt(v); + + // R - Y = Rbase + Weight(RV) * V = + // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V + svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]); + svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]); + + // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = + // Weight(GU) * ((1 << (SCALE - 1)) - 128) + + // Weight(GV) * ((1 << (SCALE - 1)) - 128) + + // Weight(GU) * U + Weight(GV) * V + svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]); + svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]); + g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]); + g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]); + + // B - Y = Bbase + Weight(BU) * U = + // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U + svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]); + svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]); + + // R = (R - Y) + Y + // FIXME: There are too many instructions here. + // Is there a better way to do this? 
+ svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb); + r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt); + r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16); + svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb); + r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt); + r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16); + svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t); + + svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb); + r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt); + r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16); + svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb); + r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt); + r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16); + svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t); + + // G = (G - Y) + Y + svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb); + g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt); + g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16); + svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb); + g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt); + g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16); + svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t); + + svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb); + g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt); + g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16); + svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb); + g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt); + g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16); + svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t); + + // B = (B - Y) + Y + svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb); + b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt); + b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16); + svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb); + b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt); + b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16); + svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t); + + svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb); + b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt); + b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 
16); + svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb); + b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt); + b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16); + svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t); + + if constexpr (ALPHA) { + svuint8x4_t rgba0 = + svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF)); + svuint8x4_t rgba1 = + svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF)); + // Store RGBA pixels to memory. + svst4_u8(pg, rgbx_row_0, rgba0); + svst4_u8(pg, rgbx_row_1, rgba1); + } else { + svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0); + svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1); + // Store RGB pixels to memory. + svst3(pg, rgbx_row_0, rgb0); + svst3(pg, rgbx_row_1, rgb1); + } + } +}; +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_YUV420_TO_RGB_SC_H diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp new file mode 100644 index 000000000..c467a3628 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/dispatch.h" +#include "kleidicv/kleidicv.h" + +#define KLEIDICV_DEFINE_C_API(name, partialname) \ + KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::partialname, \ + &kleidicv::sve2::partialname, \ + &kleidicv::sme::partialname, nullptr) + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_rgb_stripe_u8, yuv_p_to_rgb_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_bgr_stripe_u8, yuv_p_to_bgr_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_rgba_stripe_u8, + yuv_p_to_rgba_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_bgra_stripe_u8, + yuv_p_to_bgra_stripe_u8); + +extern "C" { + +kleidicv_error_t kleidicv_yuv_p_to_rgb_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, 
size_t height, + bool v_first) { + return kleidicv_yuv_p_to_rgb_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_bgr_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first) { + return kleidicv_yuv_p_to_bgr_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_rgba_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool v_first) { + return kleidicv_yuv_p_to_rgba_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_bgra_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool v_first) { + return kleidicv_yuv_p_to_bgra_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +} // extern "C" diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp new file mode 100644 index 000000000..d37d7e873 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp @@ -0,0 +1,248 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "yuv420_to_rgb_neon.h" + +namespace kleidicv::neon { +template +class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx, + public UnrollOnce, + public TryToAvoidTailLoop { + public: + using VecTraits = neon::VecTraits; + using ScalarType = VecTraits::ScalarType; + using VectorType = VecTraits::VectorType; + using YUV420XToRGBxOrBGRx::de_interleave_indices_; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + using YUV420XToRGBxOrBGRx::v_first_; + + explicit YUVpToRGBxOrBGRx(bool 
v_first) + : YUV420XToRGBxOrBGRx(v_first) {} + + void vector_path(VectorType &y0, VectorType &y1, VectorType &y2, + VectorType &y3, VectorType &u, VectorType &v, + ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { + // Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore) + // These are needed to expand each group of 4 bytes into a full 32-bit lane + uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, + 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff}; + + uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff, + 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, + 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff, + 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; + + // Expand each 8-bit channel into 32-bit vectors using table lookup and + // reinterpret + int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo)); + int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi)); + int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo)); + int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi)); + + int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo)); + int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi)); + int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo)); + int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi)); + + constexpr size_t step = ALPHA ? 4 * 16 : 3 * 16; + + yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0, + rgbx_row_1); + + yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi, + rgbx_row_0 + step, rgbx_row_1 + step); + } + + // Processes inputs which are not long enough to fit a vector. 
+ void scalar_path(size_t length, const ScalarType *y_row_0, + const ScalarType *y_row_1, const ScalarType *u_row, + const ScalarType *v_row, ScalarType *rgbx_row_0, + ScalarType *rgbx_row_1) { + const uint8_t *y_rows[2] = {y_row_0, y_row_1}; + uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1}; + + int32_t u_m128 = 0, v_m128 = 0; + for (size_t index = 0; index < length; ++index) { + disable_loop_vectorization(); + + // There is one {U, V} pair for 4 Y values. + if ((index % 2) == 0) { + u_m128 = u_row[0] - 128; + v_m128 = v_row[0] - 128; + u_row += 1; + v_row += 1; + if (v_first_) { + std::swap(u_m128, v_m128); + } + } + + yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); + } + } +}; // end of class YUVpToRGBxOrBGRx + +using YUVpToRGB = YUVpToRGBxOrBGRx; +using YUVpToRGBA = YUVpToRGBxOrBGRx; +using YUVpToBGR = YUVpToRGBxOrBGRx; +using YUVpToBGRA = YUVpToRGBxOrBGRx; + +template +kleidicv_error_t yuv2rgbx_operation(OperationType &operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, size_t height, size_t begin, + size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + // Pointer to the start of the U plane. + // Since `src` points to a planar YUV buffer, the Y plane comes first, + // occupying `src_stride * height` bytes. + const ScalarType *u = src + src_stride * height; + // Pointer to the start of the V plane. + // The V plane follows the U plane. Both U and V planes are + // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and + // are often stored in a single contiguous chroma region in memory. Depending + // on image height and stride, the starting offset of V may require adjustment + // to maintain correct alignment. 
In particular, when the image height is not + // divisible evenly by 4, the chroma rows may not align perfectly, so a + // fractional offset (in rows) is applied to calculate the V plane position. + // The formula used here accounts for this by adjusting based on row parity, + // assuming consistent memory layout across the Y, U, and V planes. + const ScalarType *v = + u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); + + // These indices control how U and V row strides are selected across the image + // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to + // two luma (Y) rows. However, when the image height is not divisible by 4, + // the mapping between chroma and luma rows becomes asymmetric. Specifically, + // when `height % 4 == 2`, the start of the V plane is offset by one chroma + // row relative to U. + // + // This results in U and V rows being interleaved with a phase difference, + // which must be accounted for during row-wise traversal. To handle this, + // `u_index` and `v_index` are used to alternate the stride selection + // independently for U and V across the loop. + // + // This mechanism ensures that memory access patterns remain correct, + // especially in layouts where U and V share a contiguous buffer with + // alternating strides. Offsetting `v_index` allows the traversal logic to + // maintain correct alignment and prevents misaligned or incorrect reads from + // the chroma buffer. + size_t u_index = 0; + size_t v_index = height % 4 == 2 ? 1 : 0; + + // Compute the actual row range in the Y plane (full resolution). + // Since each UV row maps to 2 Y rows, we double the begin/end indices. + size_t row_begin = begin * 2; + size_t row_end = std::min(height, end * 2); + size_t row_uv = begin; + + // UV stepping pattern: first half of row, then padded second half. + // Needed to match row strides between chroma and luma components. 
+ size_t uv_strides[2] = {width / 2, src_stride - width / 2}; + + // Calculate starting pointers for Y, U, and V planes at the given stripe + // start. + const ScalarType *y0 = src + row_begin * src_stride; + u = u + row_uv * src_stride / 2; + v = v + row_uv * src_stride / 2; + + size_t dcn = operation.output_channels(); + const size_t vsize = 16; + for (size_t h = row_begin; h < row_end; h += 2) { + ScalarType *row0 = dst + dst_stride * h; + ScalarType *row1 = dst + dst_stride * (h + 1); + const ScalarType *y1 = y0 + src_stride; + + // Guard for odd-height images. + // If the last row in the stripe is unpaired (odd number of rows), + // reuse the previous row pointers to avoid out-of-bounds access. + if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { + row1 = row0; + y1 = y0; + } + + LoopUnroll2 loop{width, vsize}; + + loop.unroll_twice([&](size_t index) { + uint8x16_t u_vec = vld1q_u8(u + (index >> 1)); + uint8x16_t v_vec = vld1q_u8(v + (index >> 1)); + uint8x16_t y0_vec = vld1q_u8(y0 + index); + uint8x16_t y1_vec = vld1q_u8(y1 + index); + uint8x16_t y2_vec = vld1q_u8(y0 + index + 16); + uint8x16_t y3_vec = vld1q_u8(y1 + index + 16); + + operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec, + &row0[index * dcn], &row1[index * dcn]); + }); + + loop.remaining([&](size_t index, size_t length) { + operation.scalar_path(length - index, y0 + index, y1 + index, + u + (index >> 1), v + (index >> 1), + &row0[index * dcn], &row1[index * dcn]); + }); + + y0 += src_stride * 2; + u += uv_strides[(u_index++) & 1]; + v += uv_strides[(v_index++) & 1]; + } + + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToRGB operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS 
+kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToRGBA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToBGR operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToBGRA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} +} // namespace kleidicv::neon diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sc.h b/kleidicv/src/conversions/yuv_p_to_rgb_sc.h new file mode 100644 index 000000000..2395a0c45 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sc.h @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV_P_TO_RGB_SC_H +#define KLEIDICV_YUV_P_TO_RGB_SC_H + +#include + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" +#include "yuv420_to_rgb_sc.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx { + public: + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + + explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : YUV420XToRGBxOrBGRx(v_first) {} + + // Returns the number of channels in the output image. 
+ static constexpr size_t output_channels() KLEIDICV_STREAMING { + return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; + } + + // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and + // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. + void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, + svint16_t &v, uint8_t *rgbx_row_0, + uint8_t *rgbx_row_1) KLEIDICV_STREAMING { + yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); + } +}; // end of class YUVpToRGBxOrBGRx + +using YUVpToRGB = YUVpToRGBxOrBGRx; +using YUVpToRGBA = YUVpToRGBxOrBGRx; +using YUVpToBGR = YUVpToRGBxOrBGRx; +using YUVpToBGRA = YUVpToRGBxOrBGRx; + +template +kleidicv_error_t yuv2rgbx_operation(OperationType &operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, size_t height, size_t begin, + size_t end) KLEIDICV_STREAMING { + CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + // Pointer to the start of the U plane. + // Since `src` points to a planar YUV buffer, the Y plane comes first, + // occupying `src_stride * height` bytes. + const ScalarType *u = src + src_stride * height; + // Pointer to the start of the V plane. + // The V plane follows the U plane. Both U and V planes are + // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and + // are often stored in a single contiguous chroma region in memory. Depending + // on image height and stride, the starting offset of V may require adjustment + // to maintain correct alignment. In particular, when the image height is not + // divisible evenly by 4, the chroma rows may not align perfectly, so a + // fractional offset (in rows) is applied to calculate the V plane position. 
+ // The formula used here accounts for this by adjusting based on row parity, + // assuming consistent memory layout across the Y, U, and V planes. + const ScalarType *v = + u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); + + // These indices control how U and V row strides are selected across the image + // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to + // two luma (Y) rows. However, when the image height is not divisible by 4, + // the mapping between chroma and luma rows becomes asymmetric. Specifically, + // when `height % 4 == 2`, the start of the V plane is offset by one chroma + // row relative to U. + // + // This results in U and V rows being interleaved with a phase difference, + // which must be accounted for during row-wise traversal. To handle this, + // `u_index` and `v_index` are used to alternate the stride selection + // independently for U and V across the loop. + // + // This mechanism ensures that memory access patterns remain correct, + // especially in layouts where U and V share a contiguous buffer with + // alternating strides. Offsetting `v_index` allows the traversal logic to + // maintain correct alignment and prevents misaligned or incorrect reads from + // the chroma buffer. + size_t u_index = 0; + size_t v_index = height % 4 == 2 ? 1 : 0; + + // Compute the actual row range in the Y plane (full resolution). + // Since each UV row maps to 2 Y rows, we double the begin/end indices. + size_t row_begin = begin * 2; + size_t row_end = std::min(height, end * 2); + size_t row_uv = begin; + + // UV stepping pattern: first half of row, then padded second half. + // Needed to match row strides between chroma and luma components. + size_t uv_strides[2] = {width / 2, src_stride - width / 2}; + + // Calculate starting pointers for Y, U, and V planes at the given stripe + // start. 
+ const ScalarType *y0 = src + row_begin * src_stride; + u = u + row_uv * src_stride / 2; + v = v + row_uv * src_stride / 2; + + size_t dcn = operation.output_channels(); + const size_t vsize = svcntb(); + for (size_t h = row_begin; h < row_end; h += 2) { + ScalarType *row0 = dst + dst_stride * h; + ScalarType *row1 = dst + dst_stride * (h + 1); + const ScalarType *y1 = y0 + src_stride; + + // Guard for odd-height images. + // If the last row in the stripe is unpaired (odd number of rows), + // reuse the previous row pointers to avoid out-of-bounds access. + if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { + row1 = row0; + y1 = y0; + } + + LoopUnroll2 loop{width, svcntb()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { + svbool_t pg = svptrue_b8(); + svuint8_t u8_vec = svld1(pg, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + svuint8_t y1_vec = svld1(pg, y1 + index); + svuint8_t y2_vec = svld1(pg, y0 + index + vsize); + svuint8_t y3_vec = svld1(pg, y1 + index + vsize); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + + operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi, + &row0[(index + vsize) * dcn], + &row1[(index + vsize) * dcn]); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { + svbool_t pg = svptrue_b8(); + svbool_t pg_half = svptrue_b8(); + svuint8_t u8_vec = svld1(pg_half, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + 
svuint8_t y1_vec = svld1(pg, y1 + index); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + }); + + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + size_t min_width = length - index; + size_t half_min_width = (min_width + 1) / 2; + svbool_t pg = svwhilelt_b8(int64_t(0), static_cast(min_width)); + svbool_t pg_half = + svwhilelt_b8(int64_t(0), static_cast(half_min_width)); + svuint8_t u8_vec = svld1(pg_half, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg_half, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + svuint8_t y1_vec = svld1(pg, y1 + index); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + }); + + y0 += src_stride * 2; + u += uv_strides[(u_index++) & 1]; + v += uv_strides[(v_index++) & 1]; + } + + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_rgb_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToRGB operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_rgba_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToRGBA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_bgr_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, 
bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToBGR operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_bgra_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToBGRA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_YUV_P_TO_RGB_SC_H diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp new file mode 100644 index 000000000..436191745 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "yuv_p_to_rgb_sc.h" + +namespace kleidicv::sme { + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_rgb_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_rgba_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool 
v_first, size_t begin, size_t end) { + return yuv_p_to_bgr_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgra_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} +} // namespace kleidicv::sme diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp new file mode 100644 index 000000000..0bebc99e4 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "yuv_p_to_rgb_sc.h" + +namespace kleidicv::sve2 { + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + return yuv_p_to_rgb_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + return yuv_p_to_rgba_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t yuv_p_to_bgr_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgr_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t 
yuv_p_to_bgra_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgra_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp index 098f5ce87..e790f74ac 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/dispatch.h" #include "kleidicv/kleidicv.h" diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp index 1deacceae..b0ff1e1ab 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp @@ -4,78 +4,31 @@ #include -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "yuv420_to_rgb_neon.h" namespace kleidicv::neon { - template -class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { +class YUVSpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx, + public UnrollOnce, + public TryToAvoidTailLoop { public: using VecTraits = neon::VecTraits; using ScalarType = VecTraits::ScalarType; using VectorType = VecTraits::VectorType; + using YUV420XToRGBxOrBGRx::de_interleave_indices_; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + using YUV420XToRGBxOrBGRx::v_first_; - explicit YUVSpToRGBxOrBGRx(bool is_nv21) - : y_weight_{vdupq_n_s32(kYWeight)}, - uv_weights_{vld2_s32(kUVWeights)}, - // Both the rounding shift right constant and the -128 value are - // included. 
- r_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * kUVWeights[kRVWeightIndex])}, - g_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * (kUVWeights[1] + kUVWeights[2]))}, - b_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * kUVWeights[3])}, - de_interleave_indices_{}, - is_nv21_(is_nv21) { - neon::VecTraits::load(kDeInterleaveTableIndices, - de_interleave_indices_); - } - - // Returns the number of channels in the output image. - static constexpr size_t output_channels() { - return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; - } + explicit YUVSpToRGBxOrBGRx(bool v_first) + : YUV420XToRGBxOrBGRx(v_first) {} // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. void vector_path(VectorType y0, VectorType y1, VectorType uv, ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { - // Y' = saturating(Ya - 16) and widen to 32-bits. - uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); - uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); - - uint32x4_t y0_m16_even_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); - uint32x4_t y0_m16_even_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); - uint32x4_t y0_m16_odd_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); - uint32x4_t y0_m16_odd_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); - - uint32x4_t y1_m16_even_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); - uint32x4_t y1_m16_even_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); - uint32x4_t y1_m16_odd_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); - uint32x4_t y1_m16_odd_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); - - // Y = Weight(Y) * Y' - y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), 
y_weight_); - y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); - y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); - y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); - - y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); - y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); - y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); - y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); - // Widen U and V to 32 bits. int32x4_t u_l = vqtbl1q_s8(uv, de_interleave_indices_.val[0]); int32x4_t u_h = vqtbl1q_s8(uv, de_interleave_indices_.val[1]); @@ -83,128 +36,7 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { int32x4_t v_l = vqtbl1q_s8(uv, de_interleave_indices_.val[2]); int32x4_t v_h = vqtbl1q_s8(uv, de_interleave_indices_.val[3]); - // Swap U and V channels for NV21 (order is V, U). - if (is_nv21_) { - std::swap(u_l, v_l); - std::swap(u_h, v_h); - } - - // R - Y = Rbase + Weight(RV) * V = - // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V - int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); - int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); - - // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = - // Weight(GU) * ((1 << (SCALE - 1)) - 128) + - // Weight(GV) * ((1 << (SCALE - 1)) - 128) + - // Weight(GU) * U + Weight(GV) * V - int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); - int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); - g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); - g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); - - // B - Y = Bbase + Weight(BU) * U = - // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U - int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); - int32x4_t b_sub_y_h = 
vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); - - // R = (R - Y) + Y - int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); - int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); - int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); - int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); - int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); - int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); - - int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); - int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); - int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); - int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); - int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); - int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); - - // G = (G - Y) + Y - int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); - int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); - int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l); - int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); - int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); - int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); - - int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); - int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); - int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); - int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); - int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); - int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); - - // B = (B - Y) + Y - int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); - int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); - int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); - int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); - int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); - int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); - - int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, 
y1_m16_even_l); - int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); - int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); - int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); - int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); - int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); - - // Zip even and odd RGB pixels. - uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); - uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); - uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); - uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); - uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); - uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); - - if constexpr (ALPHA) { - uint8x16x4_t rgba0, rgba1; - // Red channel - rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); - rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); - // Green channel - rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); - rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); - // Blue channel - rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); - rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); - // Alpha channel - rgba0.val[3] = vdupq_n_u8(0xFF); - rgba1.val[3] = vdupq_n_u8(0xFF); - - if constexpr (BGR) { - std::swap(rgba0.val[0], rgba0.val[2]); - std::swap(rgba1.val[0], rgba1.val[2]); - } - - // Store RGB pixels to memory. 
- vst4q_u8(rgbx_row_0, rgba0); - vst4q_u8(rgbx_row_1, rgba1); - } else { - uint8x16x3_t rgb0, rgb1; - // Red channel - rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); - rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); - // Green channel - rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); - rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); - // Blue channel - rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); - rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); - - if constexpr (BGR) { - std::swap(rgb0.val[0], rgb0.val[2]); - std::swap(rgb1.val[0], rgb1.val[2]); - } - - // Store RGB pixels to memory. - vst3q_u8(rgbx_row_0, rgb0); - vst3q_u8(rgbx_row_1, rgb1); - } + yuv420x_to_rgb(y0, y1, u_l, u_h, v_l, v_h, rgbx_row_0, rgbx_row_1); } // Processes inputs which are not long enough to fit a vector. @@ -223,66 +55,14 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { u_m128 = uv_row[0] - 128; v_m128 = uv_row[1] - 128; uv_row += 2; - if (is_nv21_) { + if (v_first_) { std::swap(u_m128, v_m128); } } - for (size_t selector = 0; selector < 2; ++selector) { - int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); - int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; - int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + - kUVWeights[kGVWeightIndex] * v_m128; - int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; - - r = rounding_shift_right(r, kWeightScale); - g = rounding_shift_right(g, kWeightScale); - b = rounding_shift_right(b, kWeightScale); - - if constexpr (BGR) { - std::swap(r, b); - } - - rgbx_rows[selector][0] = saturating_cast(r); - rgbx_rows[selector][1] = saturating_cast(g); - rgbx_rows[selector][2] = saturating_cast(b); - if constexpr (ALPHA) { - rgbx_rows[selector][3] = 0xFF; - } - - rgbx_rows[selector] += ALPHA ? 
/* RGBA */ 4 : /* RGB */ 3; - } + yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); } } - - private: - static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { - return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), - vmovn_s32(vshrq_n_s32(b, kWeightScale))); - } - - int32x4_t y_weight_; - int32x2x2_t uv_weights_; - int32x4_t r_base_; - int32x4_t g_base_; - int32x4_t b_base_; - int8x16x4_t de_interleave_indices_; - - const bool is_nv21_; - // clang-format off - - static constexpr int8_t kDeInterleaveTableIndices[64] = { - /* low and even */ - 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, - /* high and even */ - 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, - /* low and odd */ - 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, - /* high and odd */ - 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, - }; - - // clang-format on }; // end of class YUVSpToRGBxOrBGRx using YUVSpToRGB = YUVSpToRGBxOrBGRx; @@ -296,7 +76,7 @@ kleidicv_error_t yuv2rgbx_operation( const ScalarType *src_uv, size_t src_uv_stride, ScalarType *dst, size_t dst_stride, size_t width, size_t height) { CHECK_POINTER_AND_STRIDE(src_y, src_y_stride, height); - CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, height); + CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, (height + 1) / 2); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h index 201787117..9254d8faa 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h @@ -5,20 +5,22 @@ #ifndef KLEIDICV_YUV_SP_TO_RGB_SC_H #define KLEIDICV_YUV_SP_TO_RGB_SC_H -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/kleidicv.h" #include "kleidicv/sve2.h" +#include "yuv420_to_rgb_sc.h" namespace KLEIDICV_TARGET_NAMESPACE { template -class 
YUVSpToRGBxOrBGRx final { +class YUVSpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx { public: using ContextType = Context; using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; - explicit YUVSpToRGBxOrBGRx(bool is_nv21) KLEIDICV_STREAMING - : is_nv21_(is_nv21) {} + explicit YUVSpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : YUV420XToRGBxOrBGRx(v_first) {} // Returns the number of channels in the output image. static constexpr size_t output_channels() KLEIDICV_STREAMING { @@ -32,152 +34,15 @@ class YUVSpToRGBxOrBGRx final { uint8_t *rgbx_row_0, uint8_t *rgbx_row_1) KLEIDICV_STREAMING { auto pg = ctx.predicate(); - - // Both the rounding shift right constant and the -128 value are included. - constexpr int32_t kOffset = 1 << (kWeightScale - 1); - svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); - svint32_t g_base = - svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); - svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); - // Load channels: y0 and y1 are two adjacent rows. svuint8_t y0 = svld1(pg, y_row_0); svuint8_t y1 = svld1(pg, y_row_1); svuint8_t uv = svld1(pg, uv_row); - - // Y' = saturating(Ya - 16) and widen to signed 32-bits. 
- svuint8_t y0_m16 = svqsub(y0, static_cast(16)); - svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom - svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top - svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b)); - svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b)); - svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t)); - svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t)); - - svuint8_t y1_m16 = svqsub(y1, static_cast(16)); - svuint16_t y1_m16_b = svmovlb(y1_m16); - svuint16_t y1_m16_t = svmovlt(y1_m16); - svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b)); - svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b)); - svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t)); - svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t)); - - // Y = Weight(Y) * Y' - y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight); - y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight); - y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight); - y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight); - - y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight); - y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight); - y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight); - y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight); - // Widen U and V to 32 bits. svint16_t u = svreinterpret_s16(svmovlb(uv)); svint16_t v = svreinterpret_s16(svmovlt(uv)); - - if (is_nv21_) { - // Swap U and V channels for NV21 (order is V, U). 
- swap_scalable(u, v); - } - - svint32_t u_b = svmovlb(u); - svint32_t u_t = svmovlt(u); - svint32_t v_b = svmovlb(v); - svint32_t v_t = svmovlt(v); - - // R - Y = Rbase + Weight(RV) * V = - // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V - svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]); - svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]); - - // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = - // Weight(GU) * ((1 << (SCALE - 1)) - 128) + - // Weight(GV) * ((1 << (SCALE - 1)) - 128) + - // Weight(GU) * U + Weight(GV) * V - svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]); - svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]); - g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]); - g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]); - - // B - Y = Bbase + Weight(BU) * U = - // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U - svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]); - svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]); - - // R = (R - Y) + Y - // FIXME: There are too many instructions here. - // Is there a better way to do this? 
- svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb); - r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt); - r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16); - svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb); - r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt); - r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16); - svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t); - - svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb); - r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt); - r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16); - svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb); - r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt); - r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16); - svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t); - - // G = (G - Y) + Y - svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb); - g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt); - g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16); - svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb); - g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt); - g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16); - svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t); - - svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb); - g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt); - g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16); - svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb); - g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt); - g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16); - svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t); - - // B = (B - Y) + Y - svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb); - b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt); - b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16); - svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb); - b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt); - b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16); - svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t); - - svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb); - b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt); - b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 
16); - svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb); - b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt); - b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16); - svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t); - - if constexpr (ALPHA) { - svuint8x4_t rgba0 = - svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF)); - svuint8x4_t rgba1 = - svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF)); - // Store RGBA pixels to memory. - svst4_u8(pg, rgbx_row_0, rgba0); - svst4_u8(pg, rgbx_row_1, rgba1); - } else { - svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0); - svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1); - // Store RGB pixels to memory. - svst3(pg, rgbx_row_0, rgb0); - svst3(pg, rgbx_row_1, rgb1); - } + yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); } - - private: - const bool is_nv21_; }; // end of class YUVSpToRGBxOrBGRx using YUVSpToRGB = YUVSpToRGBxOrBGRx; @@ -191,7 +56,7 @@ kleidicv_error_t yuv2rgbx_operation( const ScalarType *src_uv, size_t src_uv_stride, ScalarType *dst, size_t dst_stride, size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_y, src_y_stride, height); - CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, height); + CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, (height + 1) / 2); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 2a6bcebe8..df51c3068 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -76,6 +76,38 @@ KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_f32_to_u8, float, uint8_t); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_s8_to_f32, int8_t, float); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_u8_to_f32, uint8_t, float); +/// Internal - not part of the public API and its direct use is not supported. 
+/// +/// Multithreaded implementation of kleidicv_yuv_p_to_bgr_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_bgra_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_rgb_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_rgba_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. 
/// /// Multithreaded implementation of kleidicv_yuv_sp_to_bgr_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index aef62654e..875091a99 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -13,6 +13,7 @@ #include "kleidicv/arithmetics/rotate.h" #include "kleidicv/arithmetics/scale.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" @@ -250,6 +251,54 @@ kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride, return parallel_batches(callback, mt, width, 64); } +kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_bgr_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_bgra_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_rgb_stripe_u8( + src, src_stride, dst, 
dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_rgba_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + template inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 4001aef9a..1606bc649 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -16,6 +16,10 @@ YUVSP2BGR: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2B YUVSP2BGRA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGRA_NV12)' YUVSP2RGB: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGB_NV12)' YUVSP2RGBA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGBA_NV12)' +YUVP2BGR: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGR_YV12)' +YUVP2BGRA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGRA_YV12)' +YUVP2RGB: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGB_YV12)' +YUVP2RGBA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGBA_YV12)' RGB2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_RGB2YUV)' BGR2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_BGR2YUV)' diff --git a/test/api/test_thread_yuv_p_to_rgb.cpp b/test/api/test_thread_yuv_p_to_rgb.cpp new file mode 100644 index 000000000..58bc02822 --- /dev/null 
+++ b/test/api/test_thread_yuv_p_to_rgb.cpp @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class YuvpThread : public testing::TestWithParam

{ + public: + template + void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t channels) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src(width, (height * 3 + 1) / 2), + dst_single(size_t{width} * channels, height), + dst_multi(size_t{width} * channels, height); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = + single_threaded_func(src.data(), src.stride(), dst_single.data(), + dst_single.stride(), width, height, false); + + kleidicv_error_t multi_result = multithreaded_func( + src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width, + height, false, get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src{20, (10 * 3 + 1) / 2}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src.data(), src.stride(), dst.data(), + dst.stride(), dst.width(), dst.height(), is_nv21, + get_multithreading_fake(2)); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), dst.data(), dst.stride(), 0, 1, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), dst.data(), dst.stride(), 1, 0, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, is_nv21, + get_multithreading_fake(2))); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, + is_nv21, get_multithreading_fake(2))); + } +}; + +TEST_P(YuvpThread, ToBGR) { + check(kleidicv_yuv_p_to_bgr_u8, kleidicv_thread_yuv_p_to_bgr_u8, 3); +} 
+TEST_P(YuvpThread, ToBGRA) { + check(kleidicv_yuv_p_to_bgra_u8, kleidicv_thread_yuv_p_to_bgra_u8, 4); +} +TEST_P(YuvpThread, ToRGB) { + check(kleidicv_yuv_p_to_rgb_u8, kleidicv_thread_yuv_p_to_rgb_u8, 3); +} +TEST_P(YuvpThread, ToRGBA) { + check(kleidicv_yuv_p_to_rgba_u8, kleidicv_thread_yuv_p_to_rgba_u8, 4); +} + +TEST_F(YuvpThread, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_thread_yuv_p_to_rgb_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_p_to_rgba_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_p_to_bgr_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_p_to_bgra_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_p_to_rgb_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_p_to_rgba_u8, 4, false); + run_unsupported(kleidicv_thread_yuv_p_to_bgr_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_p_to_bgra_u8, 4, false); +} + +INSTANTIATE_TEST_SUITE_P(, YuvpThread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}, + P{12, 37, 5}, P{16, 37, 5}, + P{2, 1000, 2})); diff --git a/test/api/test_thread_yuv_sp_to_rgb.cpp b/test/api/test_thread_yuv_sp_to_rgb.cpp index a358edbd6..694c23bd3 100644 --- a/test/api/test_thread_yuv_sp_to_rgb.cpp +++ b/test/api/test_thread_yuv_sp_to_rgb.cpp @@ -20,7 +20,8 @@ class YuvSpThread : public testing::TestWithParam

{ public: template void check(SingleThreadedFunc single_threaded_func, - MultithreadedFunc multithreaded_func, size_t channels) { + MultithreadedFunc multithreaded_func, size_t channels, + bool is_nv21) { unsigned width = 0, height = 0, thread_count = 0; std::tie(width, height, thread_count) = GetParam(); test::Array2D src_y(width, height), @@ -34,30 +35,77 @@ class YuvSpThread : public testing::TestWithParam

{ kleidicv_error_t single_result = single_threaded_func( src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_single.data(), dst_single.stride(), width, height, false); + dst_single.data(), dst_single.stride(), width, height, is_nv21); kleidicv_error_t multi_result = multithreaded_func( src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_multi.data(), dst_multi.stride(), width, height, false, + dst_multi.data(), dst_multi.stride(), width, height, is_nv21, get_multithreading_fake(thread_count)); EXPECT_EQ(KLEIDICV_OK, single_result); EXPECT_EQ(KLEIDICV_OK, multi_result); EXPECT_EQ_ARRAY2D(dst_multi, dst_single); } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src_y{20, 10}; + test::Array2D src_uv{20, 5}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), dst.width(), + dst.height(), is_nv21, get_multithreading_fake(2)); + + EXPECT_EQ(KLEIDICV_OK, impl(src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), 0, 1, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_OK, impl(src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), 1, 0, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst.data(), dst.stride(), KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, + is_nv21, get_multithreading_fake(2))); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst.data(), dst.stride(), KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS - 1, is_nv21, + get_multithreading_fake(1))); + } }; TEST_P(YuvSpThread, ToBGR) { - check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3); + check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3, 
true); + check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3, false); } TEST_P(YuvSpThread, ToBGRA) { - check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4); + check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4, true); + check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4, + false); } TEST_P(YuvSpThread, ToRGB) { - check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3); + check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3, true); + check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3, false); } TEST_P(YuvSpThread, ToRGBA) { - check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4); + check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4, true); + check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4, + false); +} + +TEST_F(YuvSpThread, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_thread_yuv_sp_to_rgb_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_sp_to_rgba_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_sp_to_bgr_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_sp_to_bgra_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_sp_to_rgb_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_sp_to_rgba_u8, 4, false); + run_unsupported(kleidicv_thread_yuv_sp_to_bgr_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_sp_to_bgra_u8, 4, false); } INSTANTIATE_TEST_SUITE_P(, YuvSpThread, diff --git a/test/api/test_yuv_p_to_rgb.cpp b/test/api/test_yuv_p_to_rgb.cpp new file mode 100644 index 000000000..a1c85de54 --- /dev/null +++ b/test/api/test_yuv_p_to_rgb.cpp @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "test_config.h" + +class 
YUV420p2RGBTest : public testing::Test { + public: + struct TestParams { + size_t width; + size_t src_padding; + size_t dst_padding; + size_t height; + size_t channels; + bool v_first; + bool is_bgr; + }; + + static std::vector generate_test_cases( + const std::vector& widths, + const std::vector& src_paddings, + const std::vector& dst_paddings, + const std::vector& heights, const std::vector& channels, + const std::vector& uv_cases, + const std::vector& output_image_case) { + std::vector cases; + + for (size_t w : widths) { + for (size_t src_pad : src_paddings) { + for (size_t dst_pad : dst_paddings) { + for (size_t h : heights) { + for (size_t c : channels) { + for (bool uv_case : uv_cases) { + for (bool is_bgr : output_image_case) { + cases.push_back({w, src_pad, dst_pad, h, c, uv_case, is_bgr}); + } + } + } + } + } + } + } + + return cases; + } + + static std::vector get_test_cases() { + std::vector widths = {2, 4, 6, 18, 64}; + std::vector src_paddings = {4}; + std::vector dst_paddings = {0}; + std::vector heights = {2, 5, 11, 16}; + std::vector channels = {3, 4}; + std::vector uv_cases = {true, false}; + std::vector output_image_case = {true, false}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, uv_cases, output_image_case); + } + + void run_test_case(const TestParams& params) { + test::Array2D src{params.width, (params.height * 3 + 1) / 2, + params.src_padding}; + + test::Array2D expected_dst{params.width * params.channels, + params.height, params.dst_padding, + params.channels}; + + test::Array2D dst{params.width * params.channels, params.height, + params.dst_padding, params.channels}; + + test::PseudoRandomNumberGenerator input_value_random_range; + src.fill(input_value_random_range); + + calculate_referenc(src.data(), src.stride(), expected_dst.data(), + expected_dst.stride(), params.width, params.height, + params.v_first, params.is_bgr, params.channels); + + auto status = KLEIDICV_OK; + + if (params.channels == 3) 
{ + if (!params.is_bgr) { + status = kleidicv_yuv_p_to_rgb_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } else { + status = kleidicv_yuv_p_to_bgr_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } + } + + if (params.channels == 4) { + if (!params.is_bgr) { + status = kleidicv_yuv_p_to_rgba_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } else { + status = kleidicv_yuv_p_to_bgra_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } + } + + EXPECT_EQ(KLEIDICV_OK, status); + EXPECT_EQ_ARRAY2D(expected_dst, dst); + } + + template + void run_unsupported(Func impl, size_t channels, bool v_first) { + test::Array2D src{20, (10 * 3 + 1) / 2}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src.data(), src.stride(), dst.data(), + dst.stride(), dst.width(), dst.height(), v_first); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 0, 1, v_first)); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 1, 0, v_first)); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, v_first)); + EXPECT_EQ( + KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, v_first)); + } + + private: + static uint8_t saturate_cast_s32_to_u8(int32_t rhs) { + return static_cast( + std::min(std::max(0, rhs), + static_cast(std::numeric_limits::max()))); + } + void calculate_referenc(const uint8_t* src, size_t src_stride, uint8_t* dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, bool BGR, size_t channels) { + // this will be the pointer to the u plane + const uint8_t* u = src + src_stride * height; + 
// this will be the pointer to the v plane + const uint8_t* v = src + src_stride * (height + height / 4) + + (width / 2) * ((height % 4) / 2); + size_t ustepIdx = 0; + size_t vstepIdx = height % 4 == 2 ? 1 : 0; + size_t uvsteps[2] = {width / 2, static_cast(src_stride) - width / 2}; + size_t usIdx = ustepIdx, vsIdx = vstepIdx; + const uint8_t* y1 = src; + const uint8_t* u1 = u; + const uint8_t* v1 = v; + for (size_t h = 0; h < height; h++) { + for (size_t w = 0; w < width; w++) { + // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign) + int32_t y = y1[h * src_stride + w]; + // NOLINTEND(clang-analyzer-core.uninitialized.Assign) + y = std::max(0, y - 16); + int32_t u = u1[w >> 1]; + u -= 128; + int32_t v = v1[w >> 1]; + v -= 128; + if (v_first) { + std::swap(u, v); + } + int32_t r = ((1220542 * y) + (1673527 * v) + (1 << 19)) >> 20; + int32_t g = + ((1220542 * y) - (409993 * u) - (852492 * v) + (1 << 19)) >> 20; + int32_t b = ((1220542 * y) + (2116026 * u) + (1 << 19)) >> 20; + + uint8_t r_u8 = saturate_cast_s32_to_u8(r); + uint8_t g_u8 = saturate_cast_s32_to_u8(g); + uint8_t b_u8 = saturate_cast_s32_to_u8(b); + + if (BGR) { + std::swap(b_u8, r_u8); + } + dst[h * dst_stride + w * channels + 0] = r_u8; + dst[h * dst_stride + w * channels + 1] = g_u8; + dst[h * dst_stride + w * channels + 2] = b_u8; + if (channels == 4) { + dst[h * dst_stride + w * channels + 3] = 0xff; + } + } + if ((h % 2) == 1) { + u1 += uvsteps[(usIdx++) & 1]; + v1 += uvsteps[(vsIdx++) & 1]; + } + } + } +}; + +TEST_F(YUV420p2RGBTest, ConvertspaddedInputsWithAllParamCombinations) { + for (const auto& params : get_test_cases()) { + run_test_case(params); + } +} + +TEST_F(YUV420p2RGBTest, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_yuv_p_to_rgb_u8, 3, true); + run_unsupported(kleidicv_yuv_p_to_rgba_u8, 4, true); + run_unsupported(kleidicv_yuv_p_to_bgr_u8, 3, true); + run_unsupported(kleidicv_yuv_p_to_bgra_u8, 4, true); + run_unsupported(kleidicv_yuv_p_to_rgb_u8, 3, false); + 
run_unsupported(kleidicv_yuv_p_to_rgba_u8, 4, false); + run_unsupported(kleidicv_yuv_p_to_bgr_u8, 3, false); + run_unsupported(kleidicv_yuv_p_to_bgra_u8, 4, false); +} -- GitLab