From 26e326ef806db812badb7752e0cbd5c5df3e05ef Mon Sep 17 00:00:00 2001 From: Noureldin Abdelfattah Date: Tue, 5 Aug 2025 15:32:36 +0100 Subject: [PATCH] Add YUV420p to RGBx & BGRx --- CHANGELOG.md | 1 + adapters/opencv/kleidicv_hal.cpp | 36 +++ adapters/opencv/kleidicv_hal.h | 15 + benchmark/benchmark.cpp | 35 ++- conformity/opencv/test_cvtcolor.cpp | 13 +- doc/functionality.md | 44 +-- doc/opencv.md | 13 +- .../{yuv_sp_to_rgb.h => yuv_420_to_rgb.h} | 107 ++++++- kleidicv/include/kleidicv/kleidicv.h | 57 ++++ kleidicv/src/conversions/yuv420_to_rgb_neon.h | 260 ++++++++++++++++++ kleidicv/src/conversions/yuv420_to_rgb_sc.h | 158 +++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_api.cpp | 58 ++++ .../src/conversions/yuv_p_to_rgb_neon.cpp | 248 +++++++++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_sc.h | 231 ++++++++++++++++ kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp | 40 +++ .../src/conversions/yuv_p_to_rgb_sve2.cpp | 43 +++ .../src/conversions/yuv_sp_to_rgb_api.cpp | 2 +- .../src/conversions/yuv_sp_to_rgb_neon.cpp | 248 +---------------- kleidicv/src/conversions/yuv_sp_to_rgb_sc.h | 151 +--------- .../include/kleidicv_thread/kleidicv_thread.h | 32 +++ kleidicv_thread/src/kleidicv_thread.cpp | 49 ++++ scripts/benchmark/benchmarks.txt | 4 + test/api/test_thread_yuv_p_to_rgb.cpp | 105 +++++++ test/api/test_thread_yuv_sp_to_rgb.cpp | 62 ++++- test/api/test_yuv_p_to_rgb.cpp | 216 +++++++++++++++ 25 files changed, 1810 insertions(+), 418 deletions(-) rename kleidicv/include/kleidicv/conversions/{yuv_sp_to_rgb.h => yuv_420_to_rgb.h} (51%) create mode 100644 kleidicv/src/conversions/yuv420_to_rgb_neon.h create mode 100644 kleidicv/src/conversions/yuv420_to_rgb_sc.h create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_api.cpp create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_sc.h create mode 100644 kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp create mode 100644 
kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp create mode 100644 test/api/test_thread_yuv_p_to_rgb.cpp create mode 100644 test/api/test_yuv_p_to_rgb.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 540529b88..3bac3898d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ This changelog aims to follow the guiding principles of - Gaussian Blur for any odd kernel size (up to 255x255) with replicated borders - Conversion from packed YUV 4:4:4 (interleaved and non-subsampled) to RGBA/BGRA. - Add SME2 version of saturating add with multivector loads and stores. It is marked as experimental as it is not covered by CI as of now. +- Conversion from YUV 4:2:0 planar (I420/YV12) to RGBA/BGRA. ### Changed - Performance of Gaussian Blur is greatly improved in return for some accuracy. diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 7fa9209f0..6f8af0de2 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -241,6 +241,42 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } +int yuv_to_bgr_p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int dst_width, int dst_height, int dcn, + bool swapBlue, int uIdx) { + const bool is_bgr = !swapBlue; + const bool is_nv21 = (uIdx != 0); + auto mt = get_multithreading(); + + if (dcn == 3) { + if (is_bgr) { + return convert_error(kleidicv_thread_yuv_p_to_bgr_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, + dst_height, is_nv21, mt)); + } + return convert_error(kleidicv_thread_yuv_p_to_rgb_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, dst_height, + is_nv21, mt)); + } + + if (dcn == 4) { + if (is_bgr) { + return convert_error(kleidicv_thread_yuv_p_to_bgra_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, + dst_height, is_nv21, 
mt)); + } + return convert_error(kleidicv_thread_yuv_p_to_rgba_u8( + reinterpret_cast(src_data), src_step, + reinterpret_cast(dst_data), dst_step, dst_width, dst_height, + is_nv21, mt)); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h index 0c9cd5e6b..4d3c95989 100644 --- a/adapters/opencv/kleidicv_hal.h +++ b/adapters/opencv/kleidicv_hal.h @@ -38,6 +38,10 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int yuv_to_bgr_p(const uchar *src_data, size_t src_step, uchar *dst_data, + size_t dst_step, int dst_width, int dst_height, int dcn, + bool swapBlue, int uIdx); + int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr); @@ -238,6 +242,17 @@ static inline int kleidicv_yuv_to_bgr_sp_ex_with_fallback( #undef cv_hal_cvtTwoPlaneYUVtoBGREx #define cv_hal_cvtTwoPlaneYUVtoBGREx kleidicv_yuv_to_bgr_sp_ex_with_fallback +// yuv_to_bgr_p +static inline int kleidicv_yuv_to_bgr_p_with_fallback( + const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, + int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { + return KLEIDICV_HAL_FALLBACK_FORWARD( + yuv_to_bgr_p, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, + dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); +} +#undef cv_hal_cvtThreePlaneYUVtoBGR +#define cv_hal_cvtThreePlaneYUVtoBGR kleidicv_yuv_to_bgr_p_with_fallback + // yuv_to_bgr static inline int kleidicv_yuv_to_bgr_with_fallback( const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 
218fb8902..6ee20f15d 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -715,10 +715,11 @@ template static void yuv_sp(Function f, benchmark::State& state) { bench_functor(state, [f]() { (void)f(get_source_buffer_a(), image_width * sizeof(uint8_t), - get_source_buffer_b(), + get_source_buffer_b(), (image_width / 2) * sizeof(uint8_t), get_destination_buffer_a(), - image_width * sizeof(uint8_t), image_width, image_height, true); + image_width * OutChannels * sizeof(uint8_t), image_width, + image_height, true); }); } @@ -742,6 +743,36 @@ static void yuv_sp_to_bgra(benchmark::State& state) { } BENCHMARK(yuv_sp_to_bgra); +template +static void yuv_p(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f(get_source_buffer_a(), image_width * sizeof(uint8_t), + get_destination_buffer_a(), + image_width * OutChannels * sizeof(uint8_t), image_width, + image_height, true); + }); +} + +static void yuv_p_to_rgb(benchmark::State& state) { + yuv_p<3>(kleidicv_yuv_p_to_rgb_u8, state); +} +BENCHMARK(yuv_p_to_rgb); + +static void yuv_p_to_bgr(benchmark::State& state) { + yuv_p<3>(kleidicv_yuv_p_to_bgr_u8, state); +} +BENCHMARK(yuv_p_to_bgr); + +static void yuv_p_to_rgba(benchmark::State& state) { + yuv_p<4>(kleidicv_yuv_p_to_rgba_u8, state); +} +BENCHMARK(yuv_p_to_rgba); + +static void yuv_p_to_bgra(benchmark::State& state) { + yuv_p<4>(kleidicv_yuv_p_to_bgra_u8, state); +} +BENCHMARK(yuv_p_to_bgra); + template static void morphology(Function f, benchmark::State& state) { kleidicv_morphology_context_t* context = nullptr; diff --git a/conformity/opencv/test_cvtcolor.cpp b/conformity/opencv/test_cvtcolor.cpp index 53425f39c..232aa28a7 100644 --- a/conformity/opencv/test_cvtcolor.cpp +++ b/conformity/opencv/test_cvtcolor.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 @@ -33,7 +33,8 @@ bool 
test_cvtcolor(int index, RecreatedMessageQueue& request_queue, return false; }; - // OpenCV only accepts two-plane images with an even number of columns & rows. + // OpenCV only accepts images with an even number of columns & rows to some + // YUV formats like YUV420. for (size_t x = 4; x <= 16; x += 2) { for (size_t y = 2; y <= 16; y += 2) { if (check(x, y)) { @@ -58,6 +59,14 @@ bool test_cvtcolor(int index, RecreatedMessageQueue& request_queue, std::vector& cvtcolor_tests_get() { // clang-format off static std::vector tests = { + CVTCOLOR_TEST(YUV2BGR_YV12), + CVTCOLOR_TEST(YUV2BGRA_YV12), + CVTCOLOR_TEST(YUV2RGB_YV12), + CVTCOLOR_TEST(YUV2RGBA_YV12), + CVTCOLOR_TEST(YUV2BGR_IYUV), + CVTCOLOR_TEST(YUV2BGRA_IYUV), + CVTCOLOR_TEST(YUV2RGB_IYUV), + CVTCOLOR_TEST(YUV2RGBA_IYUV), CVTCOLOR_TEST(YUV2BGR_NV12), CVTCOLOR_TEST(YUV2RGB_NV12), CVTCOLOR_TEST(YUV2BGRA_NV12), diff --git a/doc/functionality.md b/doc/functionality.md index 43ae983c8..765af9c1e 100644 --- a/doc/functionality.md +++ b/doc/functionality.md @@ -29,26 +29,30 @@ See `doc/opencv.md` for details of the functionality available in OpenCV. 
| Bitwise And | x | ## Color conversions -| | u8 | -|--------------|-----| -| Gray-RGB | x | -| Gray-RGBA | x | -| RGB-BGR | x | -| BGR-RGB | x | -| RGBA-BGRA | x | -| BGRA-RGBA | x | -| YUV420-BGR | x | -| YUV420-BGRA | x | -| YUV420-RGB | x | -| YUV420-RGBA | x | -| YUV-BGR | x | -| YUV-RGB | x | -| YUV-BGRA | x | -| YUV-RGBA | x | -| RGB-YUV | x | -| RGBA-YUV | x | -| BGR-YUV | x | -| BGRA-YUV | x | +| | u8 | +|------------------------------|-----| +| Gray-RGB | x | +| Gray-RGBA | x | +| RGB-BGR | x | +| BGR-RGB | x | +| RGBA-BGRA | x | +| BGRA-RGBA | x | +| YUV420 (planar) - BGR | x | +| YUV420 (planar) - BGRA | x | +| YUV420 (planar) - RGB | x | +| YUV420 (planar) - RGBA | x | +| YUV420 (semi-planar) - BGR | x | +| YUV420 (semi-planar) - BGRA | x | +| YUV420 (semi-planar) - RGB | x | +| YUV420 (semi-planar) - RGBA | x | +| YUV-BGR | x | +| YUV-RGB | x | +| YUV-BGRA | x | +| YUV-RGBA | x | +| RGB-YUV | x | +| RGBA-YUV | x | +| BGR-YUV | x | +| BGRA-YUV | x | ## Data type conversions | | u8 | s8 | f32 | diff --git a/doc/opencv.md b/doc/opencv.md index 6d0021177..e5303182a 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -67,12 +67,13 @@ Notes on parameters: * `src.channels()` - supports 3 for RGB and 4 for RGBA. * `dst.channels()` - supports 3 for RGB and 4 for RGBA. 
-#### [`COLOR_YUV2RGB_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9),[`COLOR_YUV2BGR_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a305e5da3816c78b3d1ffa0498424e94f),[`COLOR_YUV2RGBA_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a18346327c937bca2aa2856914ff11507),[`COLOR_YUV2BGRA_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a0ffa81c19231ddd2e9cee8616a3a4673) -YUV420 to RGB/RGBA image conversion (semi-planar). Function accepts Y plane and UV planes separately.\ -All supported permutations are listed in the table below. -| | RGB | BGR | RGBA | BGRA | -|---|-----|-----|------|------| -|YUV| x | x | x | x | +#### [`COLOR_YUV2RGB_I420`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9:~:text=V%2C%20see%20color_convert_rgb_yuv_42x-,COLOR_YUV2RGB_I420,-Python%3A%20cv.COLOR_YUV2RGB_I420), [`COLOR_YUV2RGB_NV12`](https://docs.opencv.org/4.10.0/d8/d01/group__imgproc__color__conversions.html#gga4e0972be5de079fed4e3a10e24ef5ef0a35687717fabb536c1e1ec0857714aaf9:~:text=Python%3A%20cv.COLOR_YUV2RGB-,COLOR_YUV2RGB_NV12,-Python%3A%20cv.COLOR_YUV2RGB_NV12) +YUV420 to RGB/RGBA image conversion supporting **both planar (I420/YV12)** and **semi-planar (NV12/NV21)** layouts. +All supported permutations are shown below: +| YUV Layout | RGB | BGR | RGBA | BGRA | +|---------------|-----|-----|------|------| +| Planar | x | x | x | x | +| Semi-planar | x | x | x | x | Notes on parameters: * `dst.channels()` - supports 3 for RGB and 4 for RGBA. 
diff --git a/kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h b/kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h similarity index 51% rename from kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h rename to kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h index f9dda4952..6e2c10c0f 100644 --- a/kleidicv/include/kleidicv/conversions/yuv_sp_to_rgb.h +++ b/kleidicv/include/kleidicv/conversions/yuv_420_to_rgb.h @@ -2,11 +2,46 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H -#define KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H +#ifndef KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H +#define KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H #include "kleidicv/kleidicv.h" +extern "C" { + +// For internal use only. See instead kleidicv_yuv_p_to_rgb_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to RGB format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_rgb_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_rgba_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to RGBA format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_rgba_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_bgr_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to BGR format. The stripe is defined by the range [begin, end]. 
+KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_bgr_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); + +// For internal use only. See instead kleidicv_yuv_p_to_bgra_u8. +// Converts a stripe (range of rows) of a planar YUV420 image (I420 or YV12) +// to BGRA format. The stripe is defined by the range [begin, end]. +KLEIDICV_API_DECLARATION(kleidicv_yuv_p_to_bgra_stripe_u8, const uint8_t *src, + size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + size_t begin, size_t end); +} + namespace kleidicv { /* Analog YUV to RGB conversion according to ITU-R BT.601-7 in matrix form: @@ -106,6 +141,28 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace neon namespace sve2 { @@ -128,6 +185,28 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + 
+kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace sve2 namespace sme { @@ -150,8 +229,30 @@ kleidicv_error_t yuv_sp_to_bgra_u8(const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); + +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); + +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, size_t end); + +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end); } // namespace sme } // namespace kleidicv -#endif // KLEIDICV_CONVERSIONS_YUV_SP_TO_RGB_H +#endif // KLEIDICV_CONVERSIONS_YUV_420_TO_RGB_H diff --git a/kleidicv/include/kleidicv/kleidicv.h 
b/kleidicv/include/kleidicv/kleidicv.h index 09761af88..cdb2e0160 100644 --- a/kleidicv/include/kleidicv/kleidicv.h +++ b/kleidicv/include/kleidicv/kleidicv.h @@ -721,6 +721,63 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_bgra_u8, const uint8_t *src_y, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21); +#define KLEIDICV_OP_YUVP_TO_RGB(name) \ + kleidicv_error_t name(const uint8_t *src, size_t src_stride, uint8_t *dst, \ + size_t dst_stride, size_t width, size_t height, \ + bool v_first) + +/// Converts a planar YUV420 image (I420 or YV12 layout) to RGB, RGBA, BGR, or +/// BGRA format. All channels are 8-bit wide. If the output format includes an +/// alpha channel, the alpha value is set to 0xFF. +/// +/// ### Source format: Planar YUV420 +/// The input buffer consists of three planes stored sequentially in memory: +/// - Y plane: full resolution, size = width × height +/// - U plane: quarter resolution, size = (width / 2) × (height / 2) +/// - V plane: quarter resolution, size = (width / 2) × (height / 2) +/// +/// ### Destination format +/// Destination data uses interleaved pixel layout with 3 or 4 channels per +/// pixel: +/// - R, G, B +/// - B, G, R +/// - R, G, B, Alpha +/// - B, G, R, Alpha +/// +/// One pixel occupies 3 or 4 bytes, depending on the format. +/// +/// Width and height refer to the logical image dimensions, i.e., number of +/// pixels per row and number of rows. The total number of pixels must not +/// exceed @ref KLEIDICV_MAX_IMAGE_PIXELS. +/// +/// @param src Pointer to the source buffer containing Y + U + V or Y + +/// V + U in sequential planes. +/// Must be non-null. +/// @param src_stride Stride (in bytes) between rows in the Y plane. +/// Must be at least `width`. This same stride is reused to +/// compute row access in the U and V planes, which follow +/// the Y plane in memory. 
In such memory layouts +/// (e.g., OpenCV’s I420/YV12), the U and V planes are +/// located directly after the Y plane, and their row +/// stepping can be expressed as: `uvsteps[2] = { width / 2, +/// src_stride - width / 2 }`. This ensures correct row +/// traversal across the subsampled chroma planes. +/// @param dst Pointer to the destination data. Must be non-null. +/// @param dst_stride Byte offset between the start of one destination row and +/// the next. Must be at least `(destination channel count) * +/// width`, unless the image has only one row. +/// @param width Number of pixels in a row. +/// @param height Number of rows in the data. +/// @param v_first If true, treat the layout as YV12 (Y + V + U). Otherwise, +/// I420 (Y + U + V). +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_rgb_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_rgba_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_bgr_u8); +/// @copydoc kleidicv_yuv_p_to_rgb_u8 +KLEIDICV_OP_YUVP_TO_RGB(kleidicv_yuv_p_to_bgra_u8); + /// Converts a YUV image to RGB or RGBA, pixel by pixel. All channels are 8-bit /// wide. 
/// diff --git a/kleidicv/src/conversions/yuv420_to_rgb_neon.h b/kleidicv/src/conversions/yuv420_to_rgb_neon.h new file mode 100644 index 000000000..685cd8cd4 --- /dev/null +++ b/kleidicv/src/conversions/yuv420_to_rgb_neon.h @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV420_TO_RGB_NEON_H +#define KLEIDICV_YUV420_TO_RGB_NEON_H + +#include + +#include +#include + +#include "kleidicv/kleidicv.h" +#include "kleidicv/traits.h" + +namespace kleidicv::neon { + +template +class YUV420XToRGBxOrBGRx { + public: + using ScalarType = uint8_t; + using VectorType = uint8x16_t; + + int32x4_t y_weight_; + int32x2x2_t uv_weights_; + int32x4_t r_base_, g_base_, b_base_; + int8x16x4_t de_interleave_indices_; + const bool v_first_; + + // Returns the number of channels in the output image. + static constexpr size_t output_channels() { + return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; + } + + static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { + return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), + vmovn_s32(vshrq_n_s32(b, kWeightScale))); + } + + // clang-format off + + static constexpr int8_t kDeInterleaveTableIndices[64] = { + /* low and even */ + 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, + /* high and even */ + 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, + /* low and odd */ + 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, + /* high and odd */ + 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, + }; + + // clang-format on + + explicit YUV420XToRGBxOrBGRx(bool v_first) + : y_weight_{vdupq_n_s32(kYWeight)}, + uv_weights_{vld2_s32(kUVWeights)}, + r_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - + 128 * kUVWeights[kRVWeightIndex])}, + g_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - + 128 * (kUVWeights[1] + kUVWeights[2]))}, + b_base_{vdupq_n_s32((1 << (kWeightScale - 1)) - 128 * kUVWeights[3])}, + 
de_interleave_indices_{}, + v_first_{v_first} { + neon::VecTraits::load(kDeInterleaveTableIndices, + de_interleave_indices_); + } + + void yuv420x_to_rgb(VectorType y0, VectorType y1, int32x4_t u_l, + int32x4_t u_h, int32x4_t v_l, int32x4_t v_h, + ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { + // Y' = saturating(Ya - 16) and widen to 32-bits. + uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); + uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); + + uint32x4_t y0_m16_even_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); + uint32x4_t y0_m16_even_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); + uint32x4_t y0_m16_odd_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); + uint32x4_t y0_m16_odd_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); + + uint32x4_t y1_m16_even_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); + uint32x4_t y1_m16_even_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); + uint32x4_t y1_m16_odd_l = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); + uint32x4_t y1_m16_odd_h = + vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); + + // Y = Weight(Y) * Y' + y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), y_weight_); + y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); + y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); + y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); + + y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); + y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); + y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); + y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); + + // Swap U and V planes for YV12 layout. 
+ if (v_first_) { + std::swap(u_l, v_l); + std::swap(u_h, v_h); + } + + // R - Y = Rbase + Weight(RV) * V = + // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V + int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); + int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); + + // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = + // Weight(GU) * ((1 << (SCALE - 1)) - 128) + + // Weight(GV) * ((1 << (SCALE - 1)) - 128) + + // Weight(GU) * U + Weight(GV) * V + int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); + int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); + g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); + g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); + + // B - Y = Bbase + Weight(BU) * U = + // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U + int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); + int32x4_t b_sub_y_h = vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); + + // R = (R - Y) + Y + int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); + int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); + int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); + int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); + int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); + int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); + + int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); + int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); + int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); + int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); + int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); + int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); + + // G = (G - Y) + Y + int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); + int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); + int32x4_t g0_odd_l = 
vaddq_s32(g_sub_y_l, y0_m16_odd_l); + int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); + int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); + int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); + + int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); + int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); + int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); + int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); + int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); + int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); + + // B = (B - Y) + Y + int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); + int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); + int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); + int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); + int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); + int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); + + int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, y1_m16_even_l); + int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); + int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); + int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); + int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); + int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); + + // Zip even and odd RGB pixels. 
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + + if constexpr (ALPHA) { + uint8x16x4_t rgba0, rgba1; + // Red channel + rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); + rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); + // Green channel + rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); + rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + // Blue channel + rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); + rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); + // Alpha channel + rgba0.val[3] = vdupq_n_u8(0xFF); + rgba1.val[3] = vdupq_n_u8(0xFF); + + if constexpr (BGR) { + std::swap(rgba0.val[0], rgba0.val[2]); + std::swap(rgba1.val[0], rgba1.val[2]); + } + + // Store RGB pixels to memory. + vst4q_u8(rgbx_row_0, rgba0); + vst4q_u8(rgbx_row_1, rgba1); + } else { + uint8x16x3_t rgb0, rgb1; + // Red channel + rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); + rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); + // Green channel + rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); + rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + // Blue channel + rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); + rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); + + if constexpr (BGR) { + std::swap(rgb0.val[0], rgb0.val[2]); + std::swap(rgb1.val[0], rgb1.val[2]); + } + + // Store RGB pixels to memory. 
+ vst3q_u8(rgbx_row_0, rgb0); + vst3q_u8(rgbx_row_1, rgb1); + } + } + + void yuv420x_to_rgb(const uint8_t *y_rows[2], size_t index, int32_t u_m128, + int32_t v_m128, uint8_t *rgbx_rows[2]) { + for (size_t selector = 0; selector < 2; ++selector) { + int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); + int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; + int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + + kUVWeights[kGVWeightIndex] * v_m128; + int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; + + r = rounding_shift_right(r, kWeightScale); + g = rounding_shift_right(g, kWeightScale); + b = rounding_shift_right(b, kWeightScale); + + if constexpr (BGR) { + std::swap(r, b); + } + + rgbx_rows[selector][0] = saturating_cast(r); + rgbx_rows[selector][1] = saturating_cast(g); + rgbx_rows[selector][2] = saturating_cast(b); + + if constexpr (ALPHA) { + rgbx_rows[selector][3] = 0xFF; + } + + rgbx_rows[selector] += ALPHA ? 4 : 3; + } + } +}; +} // namespace kleidicv::neon + +#endif // KLEIDICV_YUV420_TO_RGB_NEON_H diff --git a/kleidicv/src/conversions/yuv420_to_rgb_sc.h b/kleidicv/src/conversions/yuv420_to_rgb_sc.h new file mode 100644 index 000000000..af7f3c552 --- /dev/null +++ b/kleidicv/src/conversions/yuv420_to_rgb_sc.h @@ -0,0 +1,158 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV420_TO_RGB_SC_H +#define KLEIDICV_YUV420_TO_RGB_SC_H + +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class YUV420XToRGBxOrBGRx { + public: + const bool v_first_; + explicit YUV420XToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : v_first_{v_first} {} + void yuv420x_to_rgb(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, + svint16_t &v, uint8_t *rgbx_row_0, + uint8_t *rgbx_row_1) KLEIDICV_STREAMING { + // Both the rounding shift right constant and the -128 value are included. 
+ constexpr int32_t kOffset = 1 << (kWeightScale - 1); + svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); + svint32_t g_base = + svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); + svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); + + // Y' = saturating(Ya - 16) and widen to signed 32-bits. + svuint8_t y0_m16 = svqsub(y0, static_cast(16)); + svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom + svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top + svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b)); + svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b)); + svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t)); + svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t)); + + svuint8_t y1_m16 = svqsub(y1, static_cast(16)); + svuint16_t y1_m16_b = svmovlb(y1_m16); + svuint16_t y1_m16_t = svmovlt(y1_m16); + svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b)); + svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b)); + svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t)); + svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t)); + + // Y = Weight(Y) * Y' + y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight); + y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight); + y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight); + y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight); + + y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight); + y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight); + y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight); + y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight); + + // Swap U and V planes for YV12 layout. 
+ if (v_first_) { + swap_scalable(u, v); + } + + svint32_t u_b = svmovlb(u); + svint32_t u_t = svmovlt(u); + svint32_t v_b = svmovlb(v); + svint32_t v_t = svmovlt(v); + + // R - Y = Rbase + Weight(RV) * V = + // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V + svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]); + svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]); + + // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = + // Weight(GU) * ((1 << (SCALE - 1)) - 128) + + // Weight(GV) * ((1 << (SCALE - 1)) - 128) + + // Weight(GU) * U + Weight(GV) * V + svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]); + svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]); + g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]); + g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]); + + // B - Y = Bbase + Weight(BU) * U = + // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U + svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]); + svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]); + + // R = (R - Y) + Y + // FIXME: There are too many instructions here. + // Is there a better way to do this? 
+ svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb); + r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt); + r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16); + svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb); + r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt); + r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16); + svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t); + + svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb); + r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt); + r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16); + svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb); + r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt); + r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16); + svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t); + + // G = (G - Y) + Y + svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb); + g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt); + g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16); + svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb); + g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt); + g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16); + svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t); + + svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb); + g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt); + g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16); + svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb); + g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt); + g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16); + svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t); + + // B = (B - Y) + Y + svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb); + b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt); + b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16); + svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb); + b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt); + b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16); + svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t); + + svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb); + b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt); + b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 
16); + svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb); + b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt); + b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16); + svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t); + + if constexpr (ALPHA) { + svuint8x4_t rgba0 = + svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF)); + svuint8x4_t rgba1 = + svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF)); + // Store RGBA pixels to memory. + svst4_u8(pg, rgbx_row_0, rgba0); + svst4_u8(pg, rgbx_row_1, rgba1); + } else { + svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0); + svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1); + // Store RGB pixels to memory. + svst3(pg, rgbx_row_0, rgb0); + svst3(pg, rgbx_row_1, rgb1); + } + } +}; +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_YUV420_TO_RGB_SC_H diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp new file mode 100644 index 000000000..c467a3628 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_api.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/dispatch.h" +#include "kleidicv/kleidicv.h" + +#define KLEIDICV_DEFINE_C_API(name, partialname) \ + KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::partialname, \ + &kleidicv::sve2::partialname, \ + &kleidicv::sme::partialname, nullptr) + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_rgb_stripe_u8, yuv_p_to_rgb_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_bgr_stripe_u8, yuv_p_to_bgr_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_rgba_stripe_u8, + yuv_p_to_rgba_stripe_u8); + +KLEIDICV_DEFINE_C_API(kleidicv_yuv_p_to_bgra_stripe_u8, + yuv_p_to_bgra_stripe_u8); + +extern "C" { + +kleidicv_error_t kleidicv_yuv_p_to_rgb_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, 
size_t height, + bool v_first) { + return kleidicv_yuv_p_to_rgb_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_bgr_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first) { + return kleidicv_yuv_p_to_bgr_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_rgba_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool v_first) { + return kleidicv_yuv_p_to_rgba_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +kleidicv_error_t kleidicv_yuv_p_to_bgra_u8(const uint8_t *src, + size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, + size_t height, bool v_first) { + return kleidicv_yuv_p_to_bgra_stripe_u8(src, src_stride, dst, dst_stride, + width, height, v_first, 0, height); +} + +} // extern "C" diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp new file mode 100644 index 000000000..d37d7e873 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_neon.cpp @@ -0,0 +1,248 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/neon.h" +#include "yuv420_to_rgb_neon.h" + +namespace kleidicv::neon { +template +class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx, + public UnrollOnce, + public TryToAvoidTailLoop { + public: + using VecTraits = neon::VecTraits; + using ScalarType = VecTraits::ScalarType; + using VectorType = VecTraits::VectorType; + using YUV420XToRGBxOrBGRx::de_interleave_indices_; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + using YUV420XToRGBxOrBGRx::v_first_; + + explicit YUVpToRGBxOrBGRx(bool 
v_first) + : YUV420XToRGBxOrBGRx(v_first) {} + + void vector_path(VectorType &y0, VectorType &y1, VectorType &y2, + VectorType &y3, VectorType &u, VectorType &v, + ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { + // Indices to extract every 4 bytes into 4x 32-bit slots (0xff = ignore) + // These are needed to expand each group of 4 bytes into a full 32-bit lane + uint8x16_t index_lo_lo = {0, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, + 2, 0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff}; + + uint8x16_t index_lo_hi = {4, 0xff, 0xff, 0xff, 5, 0xff, 0xff, 0xff, + 6, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_lo = {8, 0xff, 0xff, 0xff, 9, 0xff, 0xff, 0xff, + 10, 0xff, 0xff, 0xff, 11, 0xff, 0xff, 0xff}; + + uint8x16_t index_hi_hi = {12, 0xff, 0xff, 0xff, 13, 0xff, 0xff, 0xff, + 14, 0xff, 0xff, 0xff, 15, 0xff, 0xff, 0xff}; + + // Expand each 8-bit channel into 32-bit vectors using table lookup and + // reinterpret + int32x4_t u_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_lo)); + int32x4_t u_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_lo_hi)); + int32x4_t u_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_lo)); + int32x4_t u_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(u, index_hi_hi)); + + int32x4_t v_lo_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_lo)); + int32x4_t v_lo_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_lo_hi)); + int32x4_t v_hi_lo = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_lo)); + int32x4_t v_hi_hi = vreinterpretq_s32_u8(vqtbl1q_u8(v, index_hi_hi)); + + constexpr size_t step = ALPHA ? 4 * 16 : 3 * 16; + + yuv420x_to_rgb(y0, y1, u_lo_lo, u_lo_hi, v_lo_lo, v_lo_hi, rgbx_row_0, + rgbx_row_1); + + yuv420x_to_rgb(y2, y3, u_hi_lo, u_hi_hi, v_hi_lo, v_hi_hi, + rgbx_row_0 + step, rgbx_row_1 + step); + } + + // Processes inputs which are not long enough to fit a vector. 
+ void scalar_path(size_t length, const ScalarType *y_row_0, + const ScalarType *y_row_1, const ScalarType *u_row, + const ScalarType *v_row, ScalarType *rgbx_row_0, + ScalarType *rgbx_row_1) { + const uint8_t *y_rows[2] = {y_row_0, y_row_1}; + uint8_t *rgbx_rows[2] = {rgbx_row_0, rgbx_row_1}; + + int32_t u_m128 = 0, v_m128 = 0; + for (size_t index = 0; index < length; ++index) { + disable_loop_vectorization(); + + // There is one {U, V} pair for 4 Y values. + if ((index % 2) == 0) { + u_m128 = u_row[0] - 128; + v_m128 = v_row[0] - 128; + u_row += 1; + v_row += 1; + if (v_first_) { + std::swap(u_m128, v_m128); + } + } + + yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); + } + } +}; // end of class YUVpToRGBxOrBGRx + +using YUVpToRGB = YUVpToRGBxOrBGRx; +using YUVpToRGBA = YUVpToRGBxOrBGRx; +using YUVpToBGR = YUVpToRGBxOrBGRx; +using YUVpToBGRA = YUVpToRGBxOrBGRx; + +template +kleidicv_error_t yuv2rgbx_operation(OperationType &operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, size_t height, size_t begin, + size_t end) { + CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + // Pointer to the start of the U plane. + // Since `src` points to a planar YUV buffer, the Y plane comes first, + // occupying `src_stride * height` bytes. + const ScalarType *u = src + src_stride * height; + // Pointer to the start of the V plane. + // The V plane follows the U plane. Both U and V planes are + // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and + // are often stored in a single contiguous chroma region in memory. Depending + // on image height and stride, the starting offset of V may require adjustment + // to maintain correct alignment. 
In particular, when the image height is not + // divisible evenly by 4, the chroma rows may not align perfectly, so a + // fractional offset (in rows) is applied to calculate the V plane position. + // The formula used here accounts for this by adjusting based on row parity, + // assuming consistent memory layout across the Y, U, and V planes. + const ScalarType *v = + u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); + + // These indices control how U and V row strides are selected across the image + // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to + // two luma (Y) rows. However, when the image height is not divisible by 4, + // the mapping between chroma and luma rows becomes asymmetric. Specifically, + // when `height % 4 == 2`, the start of the V plane is offset by one chroma + // row relative to U. + // + // This results in U and V rows being interleaved with a phase difference, + // which must be accounted for during row-wise traversal. To handle this, + // `u_index` and `v_index` are used to alternate the stride selection + // independently for U and V across the loop. + // + // This mechanism ensures that memory access patterns remain correct, + // especially in layouts where U and V share a contiguous buffer with + // alternating strides. Offsetting `v_index` allows the traversal logic to + // maintain correct alignment and prevents misaligned or incorrect reads from + // the chroma buffer. + size_t u_index = 0; + size_t v_index = height % 4 == 2 ? 1 : 0; + + // Compute the actual row range in the Y plane (full resolution). + // Since each UV row maps to 2 Y rows, we double the begin/end indices. + size_t row_begin = begin * 2; + size_t row_end = std::min(height, end * 2); + size_t row_uv = begin; + + // UV stepping pattern: first half of row, then padded second half. + // Needed to match row strides between chroma and luma components. 
+ size_t uv_strides[2] = {width / 2, src_stride - width / 2}; + + // Calculate starting pointers for Y, U, and V planes at the given stripe + // start. + const ScalarType *y0 = src + row_begin * src_stride; + u = u + row_uv * src_stride / 2; + v = v + row_uv * src_stride / 2; + + size_t dcn = operation.output_channels(); + const size_t vsize = 16; + for (size_t h = row_begin; h < row_end; h += 2) { + ScalarType *row0 = dst + dst_stride * h; + ScalarType *row1 = dst + dst_stride * (h + 1); + const ScalarType *y1 = y0 + src_stride; + + // Guard for odd-height images. + // If the last row in the stripe is unpaired (odd number of rows), + // reuse the previous row pointers to avoid out-of-bounds access. + if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { + row1 = row0; + y1 = y0; + } + + LoopUnroll2 loop{width, vsize}; + + loop.unroll_twice([&](size_t index) { + uint8x16_t u_vec = vld1q_u8(u + (index >> 1)); + uint8x16_t v_vec = vld1q_u8(v + (index >> 1)); + uint8x16_t y0_vec = vld1q_u8(y0 + index); + uint8x16_t y1_vec = vld1q_u8(y1 + index); + uint8x16_t y2_vec = vld1q_u8(y0 + index + 16); + uint8x16_t y3_vec = vld1q_u8(y1 + index + 16); + + operation.vector_path(y0_vec, y1_vec, y2_vec, y3_vec, u_vec, v_vec, + &row0[index * dcn], &row1[index * dcn]); + }); + + loop.remaining([&](size_t index, size_t length) { + operation.scalar_path(length - index, y0 + index, y1 + index, + u + (index >> 1), v + (index >> 1), + &row0[index * dcn], &row1[index * dcn]); + }); + + y0 += src_stride * 2; + u += uv_strides[(u_index++) & 1]; + v += uv_strides[(v_index++) & 1]; + } + + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToRGB operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS 
+kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToRGBA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToBGR operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + YUVpToBGRA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} +} // namespace kleidicv::neon diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sc.h b/kleidicv/src/conversions/yuv_p_to_rgb_sc.h new file mode 100644 index 000000000..2395a0c45 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sc.h @@ -0,0 +1,231 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_YUV_P_TO_RGB_SC_H +#define KLEIDICV_YUV_P_TO_RGB_SC_H + +#include + +#include "kleidicv/conversions/yuv_420_to_rgb.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" +#include "yuv420_to_rgb_sc.h" + +namespace KLEIDICV_TARGET_NAMESPACE { + +template +class YUVpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx { + public: + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + + explicit YUVpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : YUV420XToRGBxOrBGRx(v_first) {} + + // Returns the number of channels in the output image. 
+ static constexpr size_t output_channels() KLEIDICV_STREAMING { + return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; + } + + // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and + // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. + void vector_path(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, + svint16_t &v, uint8_t *rgbx_row_0, + uint8_t *rgbx_row_1) KLEIDICV_STREAMING { + yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); + } +}; // end of class YUVpToRGBxOrBGRx + +using YUVpToRGB = YUVpToRGBxOrBGRx; +using YUVpToRGBA = YUVpToRGBxOrBGRx; +using YUVpToBGR = YUVpToRGBxOrBGRx; +using YUVpToBGRA = YUVpToRGBxOrBGRx; + +template +kleidicv_error_t yuv2rgbx_operation(OperationType &operation, + const ScalarType *src, size_t src_stride, + ScalarType *dst, size_t dst_stride, + size_t width, size_t height, size_t begin, + size_t end) KLEIDICV_STREAMING { + CHECK_POINTER_AND_STRIDE(src, src_stride, (height * 3 + 1) / 2); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + // Pointer to the start of the U plane. + // Since `src` points to a planar YUV buffer, the Y plane comes first, + // occupying `src_stride * height` bytes. + const ScalarType *u = src + src_stride * height; + // Pointer to the start of the V plane. + // The V plane follows the U plane. Both U and V planes are + // subsampled at a 2:1 vertical ratio (i.e., each has height / 2 rows), and + // are often stored in a single contiguous chroma region in memory. Depending + // on image height and stride, the starting offset of V may require adjustment + // to maintain correct alignment. In particular, when the image height is not + // divisible evenly by 4, the chroma rows may not align perfectly, so a + // fractional offset (in rows) is applied to calculate the V plane position. 
+ // The formula used here accounts for this by adjusting based on row parity, + // assuming consistent memory layout across the Y, U, and V planes. + const ScalarType *v = + u + src_stride * (height / 4) + (width / 2) * ((height % 4) / 2); + + // These indices control how U and V row strides are selected across the image + // height. In planar YUV 4:2:0 format, each chroma row (U/V) corresponds to + // two luma (Y) rows. However, when the image height is not divisible by 4, + // the mapping between chroma and luma rows becomes asymmetric. Specifically, + // when `height % 4 == 2`, the start of the V plane is offset by one chroma + // row relative to U. + // + // This results in U and V rows being interleaved with a phase difference, + // which must be accounted for during row-wise traversal. To handle this, + // `u_index` and `v_index` are used to alternate the stride selection + // independently for U and V across the loop. + // + // This mechanism ensures that memory access patterns remain correct, + // especially in layouts where U and V share a contiguous buffer with + // alternating strides. Offsetting `v_index` allows the traversal logic to + // maintain correct alignment and prevents misaligned or incorrect reads from + // the chroma buffer. + size_t u_index = 0; + size_t v_index = height % 4 == 2 ? 1 : 0; + + // Compute the actual row range in the Y plane (full resolution). + // Since each UV row maps to 2 Y rows, we double the begin/end indices. + size_t row_begin = begin * 2; + size_t row_end = std::min(height, end * 2); + size_t row_uv = begin; + + // UV stepping pattern: first half of row, then padded second half. + // Needed to match row strides between chroma and luma components. + size_t uv_strides[2] = {width / 2, src_stride - width / 2}; + + // Calculate starting pointers for Y, U, and V planes at the given stripe + // start. 
+ const ScalarType *y0 = src + row_begin * src_stride; + u = u + row_uv * src_stride / 2; + v = v + row_uv * src_stride / 2; + + size_t dcn = operation.output_channels(); + const size_t vsize = svcntb(); + for (size_t h = row_begin; h < row_end; h += 2) { + ScalarType *row0 = dst + dst_stride * h; + ScalarType *row1 = dst + dst_stride * (h + 1); + const ScalarType *y1 = y0 + src_stride; + + // Guard for odd-height images. + // If the last row in the stripe is unpaired (odd number of rows), + // reuse the previous row pointers to avoid out-of-bounds access. + if (KLEIDICV_UNLIKELY(h == (row_end - 1))) { + row1 = row0; + y1 = y0; + } + + LoopUnroll2 loop{width, svcntb()}; + + loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { + svbool_t pg = svptrue_b8(); + svuint8_t u8_vec = svld1(pg, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + svint16_t u_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + svint16_t v_vec_hi = svreinterpret_s16_u16(svunpkhi_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + svuint8_t y1_vec = svld1(pg, y1 + index); + svuint8_t y2_vec = svld1(pg, y0 + index + vsize); + svuint8_t y3_vec = svld1(pg, y1 + index + vsize); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + + operation.vector_path(pg, y2_vec, y3_vec, u_vec_hi, v_vec_hi, + &row0[(index + vsize) * dcn], + &row1[(index + vsize) * dcn]); + }); + + loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { + svbool_t pg = svptrue_b8(); + svbool_t pg_half = svptrue_b8(); + svuint8_t u8_vec = svld1(pg_half, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + 
svuint8_t y1_vec = svld1(pg, y1 + index); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + }); + + loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { + size_t min_width = length - index; + size_t half_min_width = (min_width + 1) / 2; + svbool_t pg = svwhilelt_b8(int64_t(0), static_cast(min_width)); + svbool_t pg_half = + svwhilelt_b8(int64_t(0), static_cast(half_min_width)); + svuint8_t u8_vec = svld1(pg_half, u + (index >> 1)); + svint16_t u_vec_lo = svreinterpret_s16_u16(svunpklo_u16(u8_vec)); + + svuint8_t v8_vec = svld1(pg_half, v + (index >> 1)); + svint16_t v_vec_lo = svreinterpret_s16_u16(svunpklo_u16(v8_vec)); + + svuint8_t y0_vec = svld1(pg, y0 + index); + svuint8_t y1_vec = svld1(pg, y1 + index); + + operation.vector_path(pg, y0_vec, y1_vec, u_vec_lo, v_vec_lo, + &row0[index * dcn], &row1[index * dcn]); + }); + + y0 += src_stride * 2; + u += uv_strides[(u_index++) & 1]; + v += uv_strides[(v_index++) & 1]; + } + + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_rgb_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToRGB operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_rgba_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToRGBA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_bgr_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, 
bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToBGR operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +static kleidicv_error_t yuv_p_to_bgra_stripe_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, + size_t end) KLEIDICV_STREAMING { + YUVpToBGRA operation{v_first}; + return yuv2rgbx_operation(operation, src, src_stride, dst, dst_stride, width, + height, begin, end); +} +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_YUV_P_TO_RGB_SC_H diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp new file mode 100644 index 000000000..436191745 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sme.cpp @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "yuv_p_to_rgb_sc.h" + +namespace kleidicv::sme { + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_rgb_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_rgba_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_bgr_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool 
v_first, size_t begin, size_t end) { + return yuv_p_to_bgr_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +yuv_p_to_bgra_stripe_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgra_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} +} // namespace kleidicv::sme diff --git a/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp b/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp new file mode 100644 index 000000000..0bebc99e4 --- /dev/null +++ b/kleidicv/src/conversions/yuv_p_to_rgb_sve2.cpp @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "yuv_p_to_rgb_sc.h" + +namespace kleidicv::sve2 { + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgb_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + return yuv_p_to_rgb_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t yuv_p_to_rgba_stripe_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, + size_t width, size_t height, + bool v_first, size_t begin, + size_t end) { + return yuv_p_to_rgba_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t yuv_p_to_bgr_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgr_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t 
yuv_p_to_bgra_stripe_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, size_t begin, size_t end) { + return yuv_p_to_bgra_stripe_u8_sc(src, src_stride, dst, dst_stride, width, + height, v_first, begin, end); +} + +} // namespace kleidicv::sve2 diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp index 098f5ce87..e790f74ac 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_api.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/dispatch.h" #include "kleidicv/kleidicv.h" diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp index 1deacceae..b0ff1e1ab 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_neon.cpp @@ -4,78 +4,31 @@ #include -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" +#include "yuv420_to_rgb_neon.h" namespace kleidicv::neon { - template -class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { +class YUVSpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx, + public UnrollOnce, + public TryToAvoidTailLoop { public: using VecTraits = neon::VecTraits; using ScalarType = VecTraits::ScalarType; using VectorType = VecTraits::VectorType; + using YUV420XToRGBxOrBGRx::de_interleave_indices_; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; + using YUV420XToRGBxOrBGRx::v_first_; - explicit YUVSpToRGBxOrBGRx(bool is_nv21) - : y_weight_{vdupq_n_s32(kYWeight)}, - uv_weights_{vld2_s32(kUVWeights)}, - // Both the rounding shift right constant and the -128 value are - // included. 
- r_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * kUVWeights[kRVWeightIndex])}, - g_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * (kUVWeights[1] + kUVWeights[2]))}, - b_base_{vdupq_n_s32(static_cast(1 << (kWeightScale - 1)) - - 128 * kUVWeights[3])}, - de_interleave_indices_{}, - is_nv21_(is_nv21) { - neon::VecTraits::load(kDeInterleaveTableIndices, - de_interleave_indices_); - } - - // Returns the number of channels in the output image. - static constexpr size_t output_channels() { - return ALPHA ? /* RGBA */ 4 : /* RGB */ 3; - } + explicit YUVSpToRGBxOrBGRx(bool v_first) + : YUV420XToRGBxOrBGRx(v_first) {} // Processes 2 * 16 bytes (even and odd rows) of the input YUV data, and // outputs 2 * 3 (or 4) * 16 bytes of RGB (or RGBA) data per loop iteration. void vector_path(VectorType y0, VectorType y1, VectorType uv, ScalarType *rgbx_row_0, ScalarType *rgbx_row_1) { - // Y' = saturating(Ya - 16) and widen to 32-bits. - uint8x16_t y0_m16 = vqsubq_u8(y0, vdupq_n_u8(16)); - uint8x16_t y1_m16 = vqsubq_u8(y1, vdupq_n_u8(16)); - - uint32x4_t y0_m16_even_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[0])); - uint32x4_t y0_m16_even_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[1])); - uint32x4_t y0_m16_odd_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[2])); - uint32x4_t y0_m16_odd_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y0_m16, de_interleave_indices_.val[3])); - - uint32x4_t y1_m16_even_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[0])); - uint32x4_t y1_m16_even_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[1])); - uint32x4_t y1_m16_odd_l = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[2])); - uint32x4_t y1_m16_odd_h = - vreinterpretq_u32_u8(vqtbl1q_s8(y1_m16, de_interleave_indices_.val[3])); - - // Y = Weight(Y) * Y' - y0_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_l), 
y_weight_); - y0_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_even_h), y_weight_); - y0_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_l), y_weight_); - y0_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y0_m16_odd_h), y_weight_); - - y1_m16_even_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_l), y_weight_); - y1_m16_even_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_even_h), y_weight_); - y1_m16_odd_l = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_l), y_weight_); - y1_m16_odd_h = vmulq_s32(vreinterpretq_u32_s32(y1_m16_odd_h), y_weight_); - // Widen U and V to 32 bits. int32x4_t u_l = vqtbl1q_s8(uv, de_interleave_indices_.val[0]); int32x4_t u_h = vqtbl1q_s8(uv, de_interleave_indices_.val[1]); @@ -83,128 +36,7 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { int32x4_t v_l = vqtbl1q_s8(uv, de_interleave_indices_.val[2]); int32x4_t v_h = vqtbl1q_s8(uv, de_interleave_indices_.val[3]); - // Swap U and V channels for NV21 (order is V, U). - if (is_nv21_) { - std::swap(u_l, v_l); - std::swap(u_h, v_h); - } - - // R - Y = Rbase + Weight(RV) * V = - // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V - int32x4_t r_sub_y_l = vmlaq_lane_s32(r_base_, v_l, uv_weights_.val[0], 0); - int32x4_t r_sub_y_h = vmlaq_lane_s32(r_base_, v_h, uv_weights_.val[0], 0); - - // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = - // Weight(GU) * ((1 << (SCALE - 1)) - 128) + - // Weight(GV) * ((1 << (SCALE - 1)) - 128) + - // Weight(GU) * U + Weight(GV) * V - int32x4_t g_sub_y_l = vmlaq_lane_s32(g_base_, u_l, uv_weights_.val[1], 0); - int32x4_t g_sub_y_h = vmlaq_lane_s32(g_base_, u_h, uv_weights_.val[1], 0); - g_sub_y_l = vmlaq_lane_s32(g_sub_y_l, v_l, uv_weights_.val[0], 1); - g_sub_y_h = vmlaq_lane_s32(g_sub_y_h, v_h, uv_weights_.val[0], 1); - - // B - Y = Bbase + Weight(BU) * U = - // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U - int32x4_t b_sub_y_l = vmlaq_lane_s32(b_base_, u_l, uv_weights_.val[1], 1); - int32x4_t b_sub_y_h = 
vmlaq_lane_s32(b_base_, u_h, uv_weights_.val[1], 1); - - // R = (R - Y) + Y - int32x4_t r0_even_l = vaddq_s32(r_sub_y_l, y0_m16_even_l); - int32x4_t r0_even_h = vaddq_s32(r_sub_y_h, y0_m16_even_h); - int32x4_t r0_odd_l = vaddq_s32(r_sub_y_l, y0_m16_odd_l); - int32x4_t r0_odd_h = vaddq_s32(r_sub_y_h, y0_m16_odd_h); - int16x8_t r0_even = combine_scaled_s16(r0_even_l, r0_even_h); - int16x8_t r0_odd = combine_scaled_s16(r0_odd_l, r0_odd_h); - - int32x4_t r1_even_l = vaddq_s32(r_sub_y_l, y1_m16_even_l); - int32x4_t r1_even_h = vaddq_s32(r_sub_y_h, y1_m16_even_h); - int32x4_t r1_odd_l = vaddq_s32(r_sub_y_l, y1_m16_odd_l); - int32x4_t r1_odd_h = vaddq_s32(r_sub_y_h, y1_m16_odd_h); - int16x8_t r1_even = combine_scaled_s16(r1_even_l, r1_even_h); - int16x8_t r1_odd = combine_scaled_s16(r1_odd_l, r1_odd_h); - - // G = (G - Y) + Y - int32x4_t g0_even_l = vaddq_s32(g_sub_y_l, y0_m16_even_l); - int32x4_t g0_even_h = vaddq_s32(g_sub_y_h, y0_m16_even_h); - int32x4_t g0_odd_l = vaddq_s32(g_sub_y_l, y0_m16_odd_l); - int32x4_t g0_odd_h = vaddq_s32(g_sub_y_h, y0_m16_odd_h); - int16x8_t g0_even = combine_scaled_s16(g0_even_l, g0_even_h); - int16x8_t g0_odd = combine_scaled_s16(g0_odd_l, g0_odd_h); - - int32x4_t g1_even_l = vaddq_s32(g_sub_y_l, y1_m16_even_l); - int32x4_t g1_even_h = vaddq_s32(g_sub_y_h, y1_m16_even_h); - int32x4_t g1_odd_l = vaddq_s32(g_sub_y_l, y1_m16_odd_l); - int32x4_t g1_odd_h = vaddq_s32(g_sub_y_h, y1_m16_odd_h); - int16x8_t g1_even = combine_scaled_s16(g1_even_l, g1_even_h); - int16x8_t g1_odd = combine_scaled_s16(g1_odd_l, g1_odd_h); - - // B = (B - Y) + Y - int32x4_t b0_even_l = vaddq_s32(b_sub_y_l, y0_m16_even_l); - int32x4_t b0_even_h = vaddq_s32(b_sub_y_h, y0_m16_even_h); - int32x4_t b0_odd_l = vaddq_s32(b_sub_y_l, y0_m16_odd_l); - int32x4_t b0_odd_h = vaddq_s32(b_sub_y_h, y0_m16_odd_h); - int16x8_t b0_even = combine_scaled_s16(b0_even_l, b0_even_h); - int16x8_t b0_odd = combine_scaled_s16(b0_odd_l, b0_odd_h); - - int32x4_t b1_even_l = vaddq_s32(b_sub_y_l, 
y1_m16_even_l); - int32x4_t b1_even_h = vaddq_s32(b_sub_y_h, y1_m16_even_h); - int32x4_t b1_odd_l = vaddq_s32(b_sub_y_l, y1_m16_odd_l); - int32x4_t b1_odd_h = vaddq_s32(b_sub_y_h, y1_m16_odd_h); - int16x8_t b1_even = combine_scaled_s16(b1_even_l, b1_even_h); - int16x8_t b1_odd = combine_scaled_s16(b1_odd_l, b1_odd_h); - - // Zip even and odd RGB pixels. - uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); - uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); - uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); - uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); - uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); - uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); - - if constexpr (ALPHA) { - uint8x16x4_t rgba0, rgba1; - // Red channel - rgba0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); - rgba1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); - // Green channel - rgba0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); - rgba1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); - // Blue channel - rgba0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); - rgba1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); - // Alpha channel - rgba0.val[3] = vdupq_n_u8(0xFF); - rgba1.val[3] = vdupq_n_u8(0xFF); - - if constexpr (BGR) { - std::swap(rgba0.val[0], rgba0.val[2]); - std::swap(rgba1.val[0], rgba1.val[2]); - } - - // Store RGB pixels to memory. 
- vst4q_u8(rgbx_row_0, rgba0); - vst4q_u8(rgbx_row_1, rgba1); - } else { - uint8x16x3_t rgb0, rgb1; - // Red channel - rgb0.val[0] = vcombine_u8(r0.val[0], r0.val[1]); - rgb1.val[0] = vcombine_u8(r1.val[0], r1.val[1]); - // Green channel - rgb0.val[1] = vcombine_u8(g0.val[0], g0.val[1]); - rgb1.val[1] = vcombine_u8(g1.val[0], g1.val[1]); - // Blue channel - rgb0.val[2] = vcombine_u8(b0.val[0], b0.val[1]); - rgb1.val[2] = vcombine_u8(b1.val[0], b1.val[1]); - - if constexpr (BGR) { - std::swap(rgb0.val[0], rgb0.val[2]); - std::swap(rgb1.val[0], rgb1.val[2]); - } - - // Store RGB pixels to memory. - vst3q_u8(rgbx_row_0, rgb0); - vst3q_u8(rgbx_row_1, rgb1); - } + yuv420x_to_rgb(y0, y1, u_l, u_h, v_l, v_h, rgbx_row_0, rgbx_row_1); } // Processes inputs which are not long enough to fit a vector. @@ -223,66 +55,14 @@ class YUVSpToRGBxOrBGRx final : public UnrollOnce, public TryToAvoidTailLoop { u_m128 = uv_row[0] - 128; v_m128 = uv_row[1] - 128; uv_row += 2; - if (is_nv21_) { + if (v_first_) { std::swap(u_m128, v_m128); } } - for (size_t selector = 0; selector < 2; ++selector) { - int32_t y = kYWeight * std::max(y_rows[selector][index] - 16, 0); - int32_t r = y + kUVWeights[kRVWeightIndex] * v_m128; - int32_t g = y + kUVWeights[kGUWeightIndex] * u_m128 + - kUVWeights[kGVWeightIndex] * v_m128; - int32_t b = y + kUVWeights[kBUWeightIndex] * u_m128; - - r = rounding_shift_right(r, kWeightScale); - g = rounding_shift_right(g, kWeightScale); - b = rounding_shift_right(b, kWeightScale); - - if constexpr (BGR) { - std::swap(r, b); - } - - rgbx_rows[selector][0] = saturating_cast(r); - rgbx_rows[selector][1] = saturating_cast(g); - rgbx_rows[selector][2] = saturating_cast(b); - if constexpr (ALPHA) { - rgbx_rows[selector][3] = 0xFF; - } - - rgbx_rows[selector] += ALPHA ? 
/* RGBA */ 4 : /* RGB */ 3; - } + yuv420x_to_rgb(y_rows, index, u_m128, v_m128, rgbx_rows); } } - - private: - static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) { - return vcombine_s16(vmovn_s32(vshrq_n_s32(a, kWeightScale)), - vmovn_s32(vshrq_n_s32(b, kWeightScale))); - } - - int32x4_t y_weight_; - int32x2x2_t uv_weights_; - int32x4_t r_base_; - int32x4_t g_base_; - int32x4_t b_base_; - int8x16x4_t de_interleave_indices_; - - const bool is_nv21_; - // clang-format off - - static constexpr int8_t kDeInterleaveTableIndices[64] = { - /* low and even */ - 0, -1, -1, -1, 2, -1, -1, -1, 4, -1, -1, -1, 6, -1, -1, -1, - /* high and even */ - 8, -1, -1, -1, 10, -1, -1, -1, 12, -1, -1, -1, 14, -1, -1, -1, - /* low and odd */ - 1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1, - /* high and odd */ - 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, - }; - - // clang-format on }; // end of class YUVSpToRGBxOrBGRx using YUVSpToRGB = YUVSpToRGBxOrBGRx; @@ -296,7 +76,7 @@ kleidicv_error_t yuv2rgbx_operation( const ScalarType *src_uv, size_t src_uv_stride, ScalarType *dst, size_t dst_stride, size_t width, size_t height) { CHECK_POINTER_AND_STRIDE(src_y, src_y_stride, height); - CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, height); + CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, (height + 1) / 2); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h index 201787117..9254d8faa 100644 --- a/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h +++ b/kleidicv/src/conversions/yuv_sp_to_rgb_sc.h @@ -5,20 +5,22 @@ #ifndef KLEIDICV_YUV_SP_TO_RGB_SC_H #define KLEIDICV_YUV_SP_TO_RGB_SC_H -#include "kleidicv/conversions/yuv_sp_to_rgb.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/kleidicv.h" #include "kleidicv/sve2.h" +#include "yuv420_to_rgb_sc.h" namespace KLEIDICV_TARGET_NAMESPACE { template -class 
YUVSpToRGBxOrBGRx final { +class YUVSpToRGBxOrBGRx final : public YUV420XToRGBxOrBGRx { public: using ContextType = Context; using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using YUV420XToRGBxOrBGRx::yuv420x_to_rgb; - explicit YUVSpToRGBxOrBGRx(bool is_nv21) KLEIDICV_STREAMING - : is_nv21_(is_nv21) {} + explicit YUVSpToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING + : YUV420XToRGBxOrBGRx(v_first) {} // Returns the number of channels in the output image. static constexpr size_t output_channels() KLEIDICV_STREAMING { @@ -32,152 +34,15 @@ class YUVSpToRGBxOrBGRx final { uint8_t *rgbx_row_0, uint8_t *rgbx_row_1) KLEIDICV_STREAMING { auto pg = ctx.predicate(); - - // Both the rounding shift right constant and the -128 value are included. - constexpr int32_t kOffset = 1 << (kWeightScale - 1); - svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); - svint32_t g_base = - svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); - svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); - // Load channels: y0 and y1 are two adjacent rows. svuint8_t y0 = svld1(pg, y_row_0); svuint8_t y1 = svld1(pg, y_row_1); svuint8_t uv = svld1(pg, uv_row); - - // Y' = saturating(Ya - 16) and widen to signed 32-bits. 
- svuint8_t y0_m16 = svqsub(y0, static_cast(16)); - svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom - svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top - svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b)); - svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b)); - svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t)); - svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t)); - - svuint8_t y1_m16 = svqsub(y1, static_cast(16)); - svuint16_t y1_m16_b = svmovlb(y1_m16); - svuint16_t y1_m16_t = svmovlt(y1_m16); - svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b)); - svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b)); - svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t)); - svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t)); - - // Y = Weight(Y) * Y' - y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight); - y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight); - y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight); - y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight); - - y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight); - y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight); - y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight); - y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight); - // Widen U and V to 32 bits. svint16_t u = svreinterpret_s16(svmovlb(uv)); svint16_t v = svreinterpret_s16(svmovlt(uv)); - - if (is_nv21_) { - // Swap U and V channels for NV21 (order is V, U). 
- swap_scalable(u, v); - } - - svint32_t u_b = svmovlb(u); - svint32_t u_t = svmovlt(u); - svint32_t v_b = svmovlb(v); - svint32_t v_t = svmovlt(v); - - // R - Y = Rbase + Weight(RV) * V = - // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V - svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]); - svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]); - - // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V = - // Weight(GU) * ((1 << (SCALE - 1)) - 128) + - // Weight(GV) * ((1 << (SCALE - 1)) - 128) + - // Weight(GU) * U + Weight(GV) * V - svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]); - svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]); - g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]); - g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]); - - // B - Y = Bbase + Weight(BU) * U = - // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U - svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]); - svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]); - - // R = (R - Y) + Y - // FIXME: There are too many instructions here. - // Is there a better way to do this? 
- svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb); - r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt); - r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16); - svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb); - r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt); - r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16); - svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t); - - svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb); - r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt); - r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16); - svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb); - r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt); - r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16); - svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t); - - // G = (G - Y) + Y - svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb); - g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt); - g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16); - svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb); - g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt); - g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16); - svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t); - - svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb); - g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt); - g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16); - svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb); - g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt); - g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16); - svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t); - - // B = (B - Y) + Y - svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb); - b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt); - b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16); - svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb); - b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt); - b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16); - svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t); - - svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb); - b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt); - b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 
16); - svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb); - b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt); - b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16); - svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t); - - if constexpr (ALPHA) { - svuint8x4_t rgba0 = - svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF)); - svuint8x4_t rgba1 = - svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF)); - // Store RGBA pixels to memory. - svst4_u8(pg, rgbx_row_0, rgba0); - svst4_u8(pg, rgbx_row_1, rgba1); - } else { - svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0); - svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1); - // Store RGB pixels to memory. - svst3(pg, rgbx_row_0, rgb0); - svst3(pg, rgbx_row_1, rgb1); - } + yuv420x_to_rgb(pg, y0, y1, u, v, rgbx_row_0, rgbx_row_1); } - - private: - const bool is_nv21_; }; // end of class YUVSpToRGBxOrBGRx using YUVSpToRGB = YUVSpToRGBxOrBGRx; @@ -191,7 +56,7 @@ kleidicv_error_t yuv2rgbx_operation( const ScalarType *src_uv, size_t src_uv_stride, ScalarType *dst, size_t dst_stride, size_t width, size_t height) KLEIDICV_STREAMING { CHECK_POINTER_AND_STRIDE(src_y, src_y_stride, height); - CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, height); + CHECK_POINTER_AND_STRIDE(src_uv, src_uv_stride, (height + 1) / 2); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 2a6bcebe8..df51c3068 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -76,6 +76,38 @@ KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_f32_to_u8, float, uint8_t); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_s8_to_f32, int8_t, float); KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_u8_to_f32, uint8_t, float); +/// Internal - not part of the public API and its direct use is not supported. 
+/// +/// Multithreaded implementation of kleidicv_yuv_p_to_bgr_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_bgra_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_rgb_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_yuv_p_to_rgba_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. 
/// /// Multithreaded implementation of kleidicv_yuv_sp_to_bgr_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index aef62654e..875091a99 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -13,6 +13,7 @@ #include "kleidicv/arithmetics/rotate.h" #include "kleidicv/arithmetics/scale.h" +#include "kleidicv/conversions/yuv_420_to_rgb.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" @@ -250,6 +251,54 @@ kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride, return parallel_batches(callback, mt, width, 64); } +kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_bgr_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_bgra_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_rgb_stripe_u8( + src, src_stride, dst, 
dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, bool v_first, + kleidicv_thread_multithreading mt) { + auto callback = [=](unsigned begin, unsigned end) { + return kleidicv_yuv_p_to_rgba_stripe_u8( + src, src_stride, dst, dst_stride, width, height, v_first, + static_cast(begin), static_cast(end)); + }; + return parallel_batches(callback, mt, (height + 1) / 2); +} + template inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index 4001aef9a..1606bc649 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -16,6 +16,10 @@ YUVSP2BGR: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2B YUVSP2BGRA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGRA_NV12)' YUVSP2RGB: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGB_NV12)' YUVSP2RGBA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGBA_NV12)' +YUVP2BGR: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGR_YV12)' +YUVP2BGRA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2BGRA_YV12)' +YUVP2RGB: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGB_YV12)' +YUVP2RGBA: opencv_perf_imgproc '*cvtColorYUV420/*' '($PIXEL_FORMAT, COLOR_YUV2RGBA_YV12)' RGB2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_RGB2YUV)' BGR2YUV: opencv_perf_imgproc '*cvtColor8u/*' '($PIXEL_FORMAT, COLOR_BGR2YUV)' diff --git a/test/api/test_thread_yuv_p_to_rgb.cpp b/test/api/test_thread_yuv_p_to_rgb.cpp new file mode 100644 index 000000000..58bc02822 --- /dev/null 
+++ b/test/api/test_thread_yuv_p_to_rgb.cpp @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class YuvpThread : public testing::TestWithParam

{ + public: + template + void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t channels) { + unsigned width = 0, height = 0, thread_count = 0; + std::tie(width, height, thread_count) = GetParam(); + test::Array2D src(width, (height * 3 + 1) / 2), + dst_single(size_t{width} * channels, height), + dst_multi(size_t{width} * channels, height); + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = + single_threaded_func(src.data(), src.stride(), dst_single.data(), + dst_single.stride(), width, height, false); + + kleidicv_error_t multi_result = multithreaded_func( + src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width, + height, false, get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); + } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src{20, (10 * 3 + 1) / 2}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src.data(), src.stride(), dst.data(), + dst.stride(), dst.width(), dst.height(), is_nv21, + get_multithreading_fake(2)); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), dst.data(), dst.stride(), 0, 1, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_OK, + impl(src.data(), src.stride(), dst.data(), dst.stride(), 1, 0, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, is_nv21, + get_multithreading_fake(2))); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, + is_nv21, get_multithreading_fake(2))); + } +}; + +TEST_P(YuvpThread, ToBGR) { + check(kleidicv_yuv_p_to_bgr_u8, kleidicv_thread_yuv_p_to_bgr_u8, 3); +} 
+TEST_P(YuvpThread, ToBGRA) { + check(kleidicv_yuv_p_to_bgra_u8, kleidicv_thread_yuv_p_to_bgra_u8, 4); +} +TEST_P(YuvpThread, ToRGB) { + check(kleidicv_yuv_p_to_rgb_u8, kleidicv_thread_yuv_p_to_rgb_u8, 3); +} +TEST_P(YuvpThread, ToRGBA) { + check(kleidicv_yuv_p_to_rgba_u8, kleidicv_thread_yuv_p_to_rgba_u8, 4); +} + +TEST_F(YuvpThread, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_thread_yuv_p_to_rgb_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_p_to_rgba_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_p_to_bgr_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_p_to_bgra_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_p_to_rgb_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_p_to_rgba_u8, 4, false); + run_unsupported(kleidicv_thread_yuv_p_to_bgr_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_p_to_bgra_u8, 4, false); +} + +INSTANTIATE_TEST_SUITE_P(, YuvpThread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}, + P{12, 37, 5}, P{16, 37, 5}, + P{2, 1000, 2})); diff --git a/test/api/test_thread_yuv_sp_to_rgb.cpp b/test/api/test_thread_yuv_sp_to_rgb.cpp index a358edbd6..694c23bd3 100644 --- a/test/api/test_thread_yuv_sp_to_rgb.cpp +++ b/test/api/test_thread_yuv_sp_to_rgb.cpp @@ -20,7 +20,8 @@ class YuvSpThread : public testing::TestWithParam

{ public: template void check(SingleThreadedFunc single_threaded_func, - MultithreadedFunc multithreaded_func, size_t channels) { + MultithreadedFunc multithreaded_func, size_t channels, + bool is_nv21) { unsigned width = 0, height = 0, thread_count = 0; std::tie(width, height, thread_count) = GetParam(); test::Array2D src_y(width, height), @@ -34,30 +35,77 @@ class YuvSpThread : public testing::TestWithParam

{ kleidicv_error_t single_result = single_threaded_func( src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_single.data(), dst_single.stride(), width, height, false); + dst_single.data(), dst_single.stride(), width, height, is_nv21); kleidicv_error_t multi_result = multithreaded_func( src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), - dst_multi.data(), dst_multi.stride(), width, height, false, + dst_multi.data(), dst_multi.stride(), width, height, is_nv21, get_multithreading_fake(thread_count)); EXPECT_EQ(KLEIDICV_OK, single_result); EXPECT_EQ(KLEIDICV_OK, multi_result); EXPECT_EQ_ARRAY2D(dst_multi, dst_single); } + + template + void run_unsupported(Func impl, size_t channels, bool is_nv21) { + test::Array2D src_y{20, 10}; + test::Array2D src_uv{20, 5}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), dst.width(), + dst.height(), is_nv21, get_multithreading_fake(2)); + + EXPECT_EQ(KLEIDICV_OK, impl(src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), 0, 1, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_OK, impl(src_y.data(), src_y.stride(), src_uv.data(), + src_uv.stride(), dst.data(), dst.stride(), 1, 0, + is_nv21, get_multithreading_fake(2))); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst.data(), dst.stride(), KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, + is_nv21, get_multithreading_fake(2))); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(), + dst.data(), dst.stride(), KLEIDICV_MAX_IMAGE_PIXELS, + KLEIDICV_MAX_IMAGE_PIXELS - 1, is_nv21, + get_multithreading_fake(1))); + } }; TEST_P(YuvSpThread, ToBGR) { - check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3); + check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3, 
true); + check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3, false); } TEST_P(YuvSpThread, ToBGRA) { - check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4); + check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4, true); + check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4, + false); } TEST_P(YuvSpThread, ToRGB) { - check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3); + check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3, true); + check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3, false); } TEST_P(YuvSpThread, ToRGBA) { - check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4); + check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4, true); + check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4, + false); +} + +TEST_F(YuvSpThread, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_thread_yuv_sp_to_rgb_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_sp_to_rgba_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_sp_to_bgr_u8, 3, true); + run_unsupported(kleidicv_thread_yuv_sp_to_bgra_u8, 4, true); + run_unsupported(kleidicv_thread_yuv_sp_to_rgb_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_sp_to_rgba_u8, 4, false); + run_unsupported(kleidicv_thread_yuv_sp_to_bgr_u8, 3, false); + run_unsupported(kleidicv_thread_yuv_sp_to_bgra_u8, 4, false); } INSTANTIATE_TEST_SUITE_P(, YuvSpThread, diff --git a/test/api/test_yuv_p_to_rgb.cpp b/test/api/test_yuv_p_to_rgb.cpp new file mode 100644 index 000000000..a1c85de54 --- /dev/null +++ b/test/api/test_yuv_p_to_rgb.cpp @@ -0,0 +1,216 @@ +// SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "test_config.h" + +class 
YUV420p2RGBTest : public testing::Test { + public: + struct TestParams { + size_t width; + size_t src_padding; + size_t dst_padding; + size_t height; + size_t channels; + bool v_first; + bool is_bgr; + }; + + static std::vector generate_test_cases( + const std::vector& widths, + const std::vector& src_paddings, + const std::vector& dst_paddings, + const std::vector& heights, const std::vector& channels, + const std::vector& uv_cases, + const std::vector& output_image_case) { + std::vector cases; + + for (size_t w : widths) { + for (size_t src_pad : src_paddings) { + for (size_t dst_pad : dst_paddings) { + for (size_t h : heights) { + for (size_t c : channels) { + for (bool uv_case : uv_cases) { + for (bool is_bgr : output_image_case) { + cases.push_back({w, src_pad, dst_pad, h, c, uv_case, is_bgr}); + } + } + } + } + } + } + } + + return cases; + } + + static std::vector get_test_cases() { + std::vector widths = {2, 4, 6, 18, 64}; + std::vector src_paddings = {4}; + std::vector dst_paddings = {0}; + std::vector heights = {2, 5, 11, 16}; + std::vector channels = {3, 4}; + std::vector uv_cases = {true, false}; + std::vector output_image_case = {true, false}; + return generate_test_cases(widths, src_paddings, dst_paddings, heights, + channels, uv_cases, output_image_case); + } + + void run_test_case(const TestParams& params) { + test::Array2D src{params.width, (params.height * 3 + 1) / 2, + params.src_padding}; + + test::Array2D expected_dst{params.width * params.channels, + params.height, params.dst_padding, + params.channels}; + + test::Array2D dst{params.width * params.channels, params.height, + params.dst_padding, params.channels}; + + test::PseudoRandomNumberGenerator input_value_random_range; + src.fill(input_value_random_range); + + calculate_referenc(src.data(), src.stride(), expected_dst.data(), + expected_dst.stride(), params.width, params.height, + params.v_first, params.is_bgr, params.channels); + + auto status = KLEIDICV_OK; + + if (params.channels == 3) 
{ + if (!params.is_bgr) { + status = kleidicv_yuv_p_to_rgb_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } else { + status = kleidicv_yuv_p_to_bgr_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } + } + + if (params.channels == 4) { + if (!params.is_bgr) { + status = kleidicv_yuv_p_to_rgba_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } else { + status = kleidicv_yuv_p_to_bgra_u8(src.data(), src.stride(), dst.data(), + dst.stride(), params.width, + params.height, params.v_first); + } + } + + EXPECT_EQ(KLEIDICV_OK, status); + EXPECT_EQ_ARRAY2D(expected_dst, dst); + } + + template + void run_unsupported(Func impl, size_t channels, bool v_first) { + test::Array2D src{20, (10 * 3 + 1) / 2}; + + test::Array2D dst{20 * channels, 10, 0, channels}; + + test::test_null_args(impl, src.data(), src.stride(), dst.data(), + dst.stride(), dst.width(), dst.height(), v_first); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 0, 1, v_first)); + + EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), dst.data(), + dst.stride(), 1, 0, v_first)); + + EXPECT_EQ(KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, v_first)); + EXPECT_EQ( + KLEIDICV_ERROR_RANGE, + impl(src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS, v_first)); + } + + private: + static uint8_t saturate_cast_s32_to_u8(int32_t rhs) { + return static_cast( + std::min(std::max(0, rhs), + static_cast(std::numeric_limits::max()))); + } + void calculate_referenc(const uint8_t* src, size_t src_stride, uint8_t* dst, + size_t dst_stride, size_t width, size_t height, + bool v_first, bool BGR, size_t channels) { + // this will be the pointer to the u plane + const uint8_t* u = src + src_stride * height; + 
// this will be the pointer to the v plane + const uint8_t* v = src + src_stride * (height + height / 4) + + (width / 2) * ((height % 4) / 2); + size_t ustepIdx = 0; + size_t vstepIdx = height % 4 == 2 ? 1 : 0; + size_t uvsteps[2] = {width / 2, static_cast(src_stride) - width / 2}; + size_t usIdx = ustepIdx, vsIdx = vstepIdx; + const uint8_t* y1 = src; + const uint8_t* u1 = u; + const uint8_t* v1 = v; + for (size_t h = 0; h < height; h++) { + for (size_t w = 0; w < width; w++) { + // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign) + int32_t y = y1[h * src_stride + w]; + // NOLINTEND(clang-analyzer-core.uninitialized.Assign) + y = std::max(0, y - 16); + int32_t u = u1[w >> 1]; + u -= 128; + int32_t v = v1[w >> 1]; + v -= 128; + if (v_first) { + std::swap(u, v); + } + int32_t r = ((1220542 * y) + (1673527 * v) + (1 << 19)) >> 20; + int32_t g = + ((1220542 * y) - (409993 * u) - (852492 * v) + (1 << 19)) >> 20; + int32_t b = ((1220542 * y) + (2116026 * u) + (1 << 19)) >> 20; + + uint8_t r_u8 = saturate_cast_s32_to_u8(r); + uint8_t g_u8 = saturate_cast_s32_to_u8(g); + uint8_t b_u8 = saturate_cast_s32_to_u8(b); + + if (BGR) { + std::swap(b_u8, r_u8); + } + dst[h * dst_stride + w * channels + 0] = r_u8; + dst[h * dst_stride + w * channels + 1] = g_u8; + dst[h * dst_stride + w * channels + 2] = b_u8; + if (channels == 4) { + dst[h * dst_stride + w * channels + 3] = 0xff; + } + } + if ((h % 2) == 1) { + u1 += uvsteps[(usIdx++) & 1]; + v1 += uvsteps[(vsIdx++) & 1]; + } + } + } +}; + +TEST_F(YUV420p2RGBTest, ConvertspaddedInputsWithAllParamCombinations) { + for (const auto& params : get_test_cases()) { + run_test_case(params); + } +} + +TEST_F(YUV420p2RGBTest, ReturnsErrorForUnsupportedCombinations) { + run_unsupported(kleidicv_yuv_p_to_rgb_u8, 3, true); + run_unsupported(kleidicv_yuv_p_to_rgba_u8, 4, true); + run_unsupported(kleidicv_yuv_p_to_bgr_u8, 3, true); + run_unsupported(kleidicv_yuv_p_to_bgra_u8, 4, true); + run_unsupported(kleidicv_yuv_p_to_rgb_u8, 3, false); + 
run_unsupported(kleidicv_yuv_p_to_rgba_u8, 4, false); + run_unsupported(kleidicv_yuv_p_to_bgr_u8, 3, false); + run_unsupported(kleidicv_yuv_p_to_bgra_u8, 4, false); +} -- GitLab