From a371173bff761c457a8c0f201ef865622b330044 Mon Sep 17 00:00:00 2001
From: Denes Tarjan <denes.tarjan@arm.com>
Date: Fri, 26 Apr 2024 07:23:20 +0000
Subject: [PATCH] Implement RGBToYUV conversion for NEON

---
 adapters/opencv/kleidicv_hal.cpp              |  38 ++++
 adapters/opencv/kleidicv_hal.h                |  15 ++
 benchmark/benchmark.cpp                       |  30 +++
 conformity/opencv/CMakeLists.txt              |   6 +-
 conformity/opencv/test_rgb2yuv.cpp            |  56 +++++
 conformity/opencv/test_rgb2yuv.h              |  14 ++
 conformity/opencv/tests.cpp                   |   4 +-
 .../include/kleidicv/conversions/rgb_to_yuv.h |  63 +++++
 .../include/kleidicv/conversions/yuv_to_rgb.h |   8 +-
 kleidicv/include/kleidicv/kleidicv.h          |  58 ++++-
 kleidicv/src/conversions/rgb_to_yuv_api.cpp   |  16 ++
 kleidicv/src/conversions/rgb_to_yuv_neon.cpp  | 215 ++++++++++++++++++
 test/api/test_rgb_to_yuv.cpp                  | 149 ++++++++++++
 13 files changed, 659 insertions(+), 13 deletions(-)
 create mode 100644 conformity/opencv/test_rgb2yuv.cpp
 create mode 100644 conformity/opencv/test_rgb2yuv.h
 create mode 100644 kleidicv/include/kleidicv/conversions/rgb_to_yuv.h
 create mode 100644 kleidicv/src/conversions/rgb_to_yuv_api.cpp
 create mode 100644 kleidicv/src/conversions/rgb_to_yuv_neon.cpp
 create mode 100644 test/api/test_rgb_to_yuv.cpp
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 84ed53f61..9cb32ae4d 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -173,6 +173,44 @@ int yuv_to_bgr_ex(const uchar *y_data, size_t y_step, const uchar *uv_data,
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
+int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data,
+               size_t dst_step, int width, int height, int depth, int scn,
+               bool swapBlue, bool isCbCr) {
+  const bool is_bgr = !swapBlue;
+
+  if (depth != CV_8U || isCbCr) {
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+  }
+
+  if (scn == 3) {
+    if (is_bgr) {
+      return convert_error(kleidicv_bgr_to_yuv_u8(
+          reinterpret_cast<const uint8_t *>(src_data), src_step,
+          reinterpret_cast<uint8_t *>(dst_data), dst_step,
+          static_cast<size_t>(width), static_cast<size_t>(height)));
+    }
+    return convert_error(kleidicv_rgb_to_yuv_u8(
+        reinterpret_cast<const uint8_t *>(src_data), src_step,
+        reinterpret_cast<uint8_t *>(dst_data), dst_step,
+        static_cast<size_t>(width), static_cast<size_t>(height)));
+  }
+
+  if (scn == 4) {
+    if (is_bgr) {
+      return convert_error(kleidicv_bgra_to_yuv_u8(
+          reinterpret_cast<const uint8_t *>(src_data), src_step,
+          reinterpret_cast<uint8_t *>(dst_data), dst_step,
+          static_cast<size_t>(width), static_cast<size_t>(height)));
+    }
+    return convert_error(kleidicv_rgba_to_yuv_u8(
+        reinterpret_cast<const uint8_t *>(src_data), src_step,
+        reinterpret_cast<uint8_t *>(dst_data), dst_step,
+        static_cast<size_t>(width), static_cast<size_t>(height)));
+  }
+
+  return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+
 int threshold(const uchar *src_data, size_t src_step, uchar *dst_data,
               size_t dst_step, int width, int height, int depth, int cn,
               double thresh, double maxValue, int thresholdType) {
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index fbf6720c3..a25be7719 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -37,6 +37,10 @@ int yuv_to_bgr_ex(const uchar *y_data, size_t y_step, const uchar *uv_data,
                   int dst_width, int dst_height, int dcn, bool swapBlue,
                   int uIdx);
 
+int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data,
+               size_t dst_step, int width, int height, int depth, int scn,
+               bool swapBlue, bool isCbCr);
+
 int threshold(const uchar *src_data, size_t src_step, uchar *dst_data,
               size_t dst_step, int width, int height, int depth, int cn,
               double thresh, double maxValue, int thresholdType);
@@ -151,6 +155,17 @@ static inline int kleidicv_yuv_to_bgr_ex_with_fallback(
 #undef cv_hal_cvtTwoPlaneYUVtoBGREx
 #define cv_hal_cvtTwoPlaneYUVtoBGREx kleidicv_yuv_to_bgr_ex_with_fallback
 
+// bgr_to_yuv
+static inline int kleidicv_bgr_to_yuv_with_fallback(
+    const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step,
+    int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(bgr_to_yuv, cv_hal_cvtBGRtoYUV, src_data,
+                                       src_step, dst_data, dst_step, width,
+                                       height, depth, scn, swapBlue, isCbCr);
+}
+#undef cv_hal_cvtBGRtoYUV
+#define cv_hal_cvtBGRtoYUV kleidicv_bgr_to_yuv_with_fallback
+
 // threshold
 static inline int kleidicv_threshold_with_fallback(
     const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step,
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 0286d6339..b66e4d7be 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -41,6 +41,36 @@ BENCH_BINARY_OP(saturating_add_s8, int8_t);
 BENCH_BINARY_OP(saturating_sub_u16, uint16_t);
 BENCH_BINARY_OP(saturating_absdiff_s32, int32_t);
 
+template <typename T, typename Function>
+static void bench_unary_op(Function f, size_t channels,
+                           benchmark::State& state) {
+  // Setup
+  std::vector<T> src, dst;
+  src.resize(image_width * image_height * channels);
+  dst.resize(image_width * image_height * channels);
+
+  std::mt19937 generator;
+  std::generate(src.begin(), src.end(), generator);
+
+  for (auto _ : state) {
+    // This code gets benchmarked
+    auto unused = f(src.data(), image_width, dst.data(), image_width,
+                    image_width, image_height);
+    (void)unused;
+  }
+}
+
+#define BENCH_UNARY_OP(name, channels, type)                \
+  static void name(benchmark::State& state) {               \
+    bench_unary_op<type>(kleidicv_##name, channels, state); \
+  }                                                         \
+  BENCHMARK(name)
+
+BENCH_UNARY_OP(rgb_to_yuv_u8, 3, uint8_t);
+BENCH_UNARY_OP(rgba_to_yuv_u8, 4, uint8_t);
+BENCH_UNARY_OP(bgr_to_yuv_u8, 3, uint8_t);
+BENCH_UNARY_OP(bgra_to_yuv_u8, 4, uint8_t);
+
 static void min_max_loc_u8(benchmark::State& state) {
   // Setup
   std::vector<uint8_t> src;
diff --git a/conformity/opencv/CMakeLists.txt b/conformity/opencv/CMakeLists.txt
index 360b44619..74ea42d6e 100644
--- a/conformity/opencv/CMakeLists.txt
+++ b/conformity/opencv/CMakeLists.txt
@@ -30,8 +30,9 @@ add_executable(
   manager
   manager.cpp
   tests.cpp
-  test_sobel.cpp
   test_gaussian_blur.cpp
+  test_rgb2yuv.cpp
+  test_sobel.cpp
 )
 
 target_link_libraries(
@@ -66,8 +67,9 @@ add_executable(
   subordinate
   subordinate.cpp
   tests.cpp
-  test_sobel.cpp
   test_gaussian_blur.cpp
+  test_rgb2yuv.cpp
+  test_sobel.cpp
 )
 
 target_link_libraries(
diff --git a/conformity/opencv/test_rgb2yuv.cpp b/conformity/opencv/test_rgb2yuv.cpp
new file mode 100644
index 000000000..e6633bf3c
--- /dev/null
+++ b/conformity/opencv/test_rgb2yuv.cpp
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "test_rgb2yuv.h"
+
+#include <vector>
+
+template <bool SwitchBlue>
+cv::Mat exec_rgb2yuv(cv::Mat& input) {
+  cv::Mat result;
+  if constexpr (SwitchBlue) {
+    cv::cvtColor(input, result, cv::COLOR_BGR2YUV);
+  } else {
+    cv::cvtColor(input, result, cv::COLOR_RGB2YUV);
+  }
+  return result;
+}
+
+#if MANAGER
+template <bool SwitchBlue, size_t Channels>
+bool test_rgb2yuv(int index, RecreatedMessageQueue& request_queue,
+                  RecreatedMessageQueue& reply_queue) {
+  cv::RNG rng(0);
+
+  for (size_t x = 5; x <= 16; ++x) {
+    for (size_t y = 5; y <= 16; ++y) {
+      cv::Mat input(x, y, CV_8UC(Channels));
+      rng.fill(input, cv::RNG::UNIFORM, 0, 255);
+
+      cv::Mat actual = exec_rgb2yuv<SwitchBlue>(input);
+      cv::Mat expected = get_expected_from_subordinate(index, request_queue,
+                                                       reply_queue, input);
+
+      if (are_matrices_different<uint8_t>(0, actual, expected)) {
+        fail_print_matrices(x, y, input, actual, expected);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+#endif
+
+std::vector<test>& rgb2yuv_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("RGB2YUV", (test_rgb2yuv<false, 3>), exec_rgb2yuv<false>),
+    TEST("RGBA2YUV", (test_rgb2yuv<false, 4>), exec_rgb2yuv<false>),
+    TEST("BGR2YUV", (test_rgb2yuv<true, 3>), exec_rgb2yuv<true>),
+    TEST("BGRA2YUV", (test_rgb2yuv<true, 4>), exec_rgb2yuv<true>),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/test_rgb2yuv.h b/conformity/opencv/test_rgb2yuv.h
new file mode 100644
index 000000000..03d5687be
--- /dev/null
+++ b/conformity/opencv/test_rgb2yuv.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_CONFORMITY_TEST_RGB2YUV_H_
+#define KLEIDICV_OPENCV_CONFORMITY_TEST_RGB2YUV_H_
+
+#include <vector>
+
+#include "tests.h"
+
+std::vector<test>& rgb2yuv_tests_get();
+
+#endif  // KLEIDICV_OPENCV_CONFORMITY_TEST_RGB2YUV_H_
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index e5fd49e6f..2c02e39fb 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -11,6 +11,7 @@
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "test_gaussian_blur.h"
+#include "test_rgb2yuv.h"
 #include "test_sobel.h"
 
 static std::vector<test> merge_tests(
@@ -24,8 +25,9 @@ static std::vector<test> merge_tests(
 }
 
 std::vector<test> all_tests = merge_tests({
-    sobel_tests_get,
     gaussian_blur_tests_get,
+    rgb2yuv_tests_get,
+    sobel_tests_get,
 });
 
 #if MANAGER
diff --git a/kleidicv/include/kleidicv/conversions/rgb_to_yuv.h b/kleidicv/include/kleidicv/conversions/rgb_to_yuv.h
new file mode 100644
index 000000000..c45f7adc8
--- /dev/null
+++ b/kleidicv/include/kleidicv/conversions/rgb_to_yuv.h
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_CONVERSIONS_RGB_TO_YUV_H
+#define KLEIDICV_CONVERSIONS_RGB_TO_YUV_H
+
+#include "kleidicv/kleidicv.h"
+
+namespace kleidicv {
+
+/*
+Analog YUV to RGB conversion according to ITU-R BT.601-7:
+
+  Y = 0.299 * R + 0.587 * G + 0.114 * B;
+  U = (B - Y) / 1.772 = (B - Y) * 0,5643340858
+  V = (R - Y) / 1.402 = (R - Y) * 0,7132667618
+
+With 14-bit scaling and rounding, the integer constants are:
+
+  Y = 4899 * R + 9617 * G + 1868 * B
+  U = (B - Y) * 8061
+  V = (R - Y) * 14369
+
+The final results are calculated using rounding shift right and saturating
+to 8-bit unsigned values:
+
+  X = saturating_cast<uint8_t>((X' + (1 << 13)) >> 14)
+
+Sources:
+  [1] https://www.itu.int/rec/R-REC-BT.601
+*/
+
+// Weights according to the calculation at the top of this file.
+static constexpr size_t kWeightScale = 14;
+static constexpr int16_t kRYWeight = 4899;
+static constexpr int16_t kGYWeight = 9617;
+static constexpr int16_t kBYWeight = 1868;
+static constexpr int16_t kRVWeight = 14369;
+static constexpr int16_t kBUWeight = 8061;
+
+namespace neon {
+
+kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                               uint8_t *dst, size_t dst_stride, size_t width,
+                               size_t height);
+
+kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                               uint8_t *dst, size_t dst_stride, size_t width,
+                               size_t height);
+
+kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                                uint8_t *dst, size_t dst_stride, size_t width,
+                                size_t height);
+
+kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                                uint8_t *dst, size_t dst_stride, size_t width,
+                                size_t height);
+}  // namespace neon
+
+}  // namespace kleidicv
+
+#endif  // KLEIDICV_CONVERSIONS_RGB_TO_YUV_H
diff --git a/kleidicv/include/kleidicv/conversions/yuv_to_rgb.h b/kleidicv/include/kleidicv/conversions/yuv_to_rgb.h
index 3ec746e94..cfe77e796 100644
--- a/kleidicv/include/kleidicv/conversions/yuv_to_rgb.h
+++ b/kleidicv/include/kleidicv/conversions/yuv_to_rgb.h
@@ -18,7 +18,7 @@ namespace kleidicv {
 After re-normalization of the analog signal:
 
   Yan = Ya
-  Uan = Ua / 1.722
+  Uan = Ua / 1.772
   Van = Va / 1.402
 
   [ Yan ] = [  0.299000  0.587000  0.114000 ] [ Ra ]
@@ -51,14 +51,14 @@ The values used in this implementation are the following:
   [ G ] = [ 1.164000 -0.391000 -0.813000 ] [ U' ]
   [ B ] = [ 1.164000  2.018000  0.000000 ] [ V' ]
 
-With 20 bit scaling and rounding, the integer constants are:
+With 20-bit scaling and rounding, the integer constants are:
 
   [ R ] = [ 1,220,542          0   1,673,527 ] [ Y' ]
   [ G ] = [ 1,220,542  - 409,993    -852,492 ] [ U' ] + (1 << 19) >> 20
   [ B ] = [ 1,220,542  2,116,026           0 ] [ V' ]
 
-The final results are calculated using roundign shift right and saturating
-to 8 bit unsigned values:
+The final results are calculated using rounding shift right and saturating
+to 8-bit unsigned values:
 
   X = saturating_cast<uint8_t>((X' + (1 << 19)) >> 20)
 
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index f4052e6a8..fb69d3233 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -510,7 +510,8 @@ KLEIDICV_API_DECLARATION(kleidicv_rgba_to_rgb_u8, const uint8_t *src,
                          size_t src_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height);
 
-/// Converts an NV12 or NV21 YUV image to RGB. All channels are 8-bit wide.
+/// Converts an NV12 or NV21 (Semi-Planar) YUV image to RGB. All channels are
+/// 8-bit wide.
 ///
 /// Destination data is filled liked this:
 /// | R,G,B | R,G,B | R,G,B | ...
@@ -547,7 +548,8 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_rgb_u8, const uint8_t *src_y,
                          size_t src_uv_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, bool is_nv21);
 
-/// Converts an NV12 or NV21 YUV image to BGR. All channels are 8-bit wide.
+/// Converts an NV12 or NV21 (Semi-Planar) YUV image to BGR. All channels are
+/// 8-bit wide.
 ///
 /// Destination data is filled liked this:
 /// | B,G,R | B,G,R | B,G,R | ...
@@ -584,8 +586,8 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_bgr_u8, const uint8_t *src_y,
                          size_t src_uv_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, bool is_nv21);
 
-/// Converts an NV12 or NV21 YUV image to RGBA. All channels are 8-bit wide.
-/// Alpha channel is set to 0xFF.
+/// Converts an NV12 or NV21 (Semi-Planar) YUV image to RGBA. All channels are
+/// 8-bit wide. Alpha channel is set to 0xFF.
 ///
 /// Destination data is filled liked this:
 /// | R,G,B,A | R,G,B,A | R,G,B,A | ...
@@ -620,8 +622,8 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_rgba_u8, const uint8_t *src_y,
                          size_t src_uv_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, bool is_nv21);
 
-/// Converts an NV12 or NV21 YUV image to BGRA. All channels are 8-bit wide.
-/// Alpha channel is set to 0xFF.
+/// Converts an NV12 or NV21 (Semi-Planar) YUV image to BGRA. All channels are
+/// 8-bit wide. Alpha channel is set to 0xFF.
 ///
 /// Destination data is filled liked this:
 /// | B,G,R,A | B,G,R,A | B,G,R,A | ...
@@ -656,6 +658,50 @@ KLEIDICV_API_DECLARATION(kleidicv_yuv_sp_to_bgra_u8, const uint8_t *src_y,
                          size_t src_uv_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, bool is_nv21);
 
+/// Converts an RGB image to YUV, pixel by pixel. All channels are 8-bit wide.
+///
+/// Source data can have 3 channels or 4 if there is an alpha channel:
+/// - R,G,B
+/// - R,G,B,Alpha
+/// - B,G,R
+/// - B,G,R,Alpha
+///
+/// Destination data is filled liked this:
+/// | Y,U,V | Y,U,V | Y,U,V | ...
+/// One pixel is represented by 3 bytes. There is no padding between the pixels.
+/// Alpha channel is not used in the conversion.
+///
+/// Width and height are the same for the source and for the destination. Number
+/// of pixels is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS.
+///
+/// @param src         Pointer to the source data. Must be non-null.
+/// @param src_stride  Distance in bytes from the start of one row to the
+///                    start of the next row for the source data.
+///                    Must not be less than width * (number of channels) *
+///                    sizeof(uint8).
+/// @param dst         Pointer to the destination data. Must be non-null.
+/// @param dst_stride  Distance in bytes from the start of one row to the
+///                    start of the next row for the destination data.
+///                    Must not be less than width * 3 * sizeof(uint8).
+/// @param width       Number of pixels in a row.
+/// @param height      Number of rows in the data.
+///
+KLEIDICV_API_DECLARATION(kleidicv_bgr_to_yuv_u8, const uint8_t *src,
+                         size_t src_stride, uint8_t *dst, size_t dst_stride,
+                         size_t width, size_t height);
+/// @copydoc kleidicv_bgr_to_yuv_u8
+KLEIDICV_API_DECLARATION(kleidicv_rgb_to_yuv_u8, const uint8_t *src,
+                         size_t src_stride, uint8_t *dst, size_t dst_stride,
+                         size_t width, size_t height);
+/// @copydoc kleidicv_bgr_to_yuv_u8
+KLEIDICV_API_DECLARATION(kleidicv_bgra_to_yuv_u8, const uint8_t *src,
+                         size_t src_stride, uint8_t *dst, size_t dst_stride,
+                         size_t width, size_t height);
+/// @copydoc kleidicv_bgr_to_yuv_u8
+KLEIDICV_API_DECLARATION(kleidicv_rgba_to_yuv_u8, const uint8_t *src,
+                         size_t src_stride, uint8_t *dst, size_t dst_stride,
+                         size_t width, size_t height);
+
 /// Performs a comparison of each element's value in `src` with respect to a
 /// caller defined threshold. The strictly larger elements are set to
 /// `value` and the rest to 0.
diff --git a/kleidicv/src/conversions/rgb_to_yuv_api.cpp b/kleidicv/src/conversions/rgb_to_yuv_api.cpp
new file mode 100644
index 000000000..baa4a3f39
--- /dev/null
+++ b/kleidicv/src/conversions/rgb_to_yuv_api.cpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/conversions/rgb_to_yuv.h"
+#include "kleidicv/dispatch.h"
+#include "kleidicv/kleidicv.h"
+
+#define KLEIDICV_DEFINE_C_API(name, partialname)                           \
+  KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::partialname, nullptr, \
+                              nullptr)
+
+KLEIDICV_DEFINE_C_API(kleidicv_rgb_to_yuv_u8, rgb_to_yuv_u8);
+KLEIDICV_DEFINE_C_API(kleidicv_bgr_to_yuv_u8, bgr_to_yuv_u8);
+KLEIDICV_DEFINE_C_API(kleidicv_rgba_to_yuv_u8, rgba_to_yuv_u8);
+KLEIDICV_DEFINE_C_API(kleidicv_bgra_to_yuv_u8, bgra_to_yuv_u8);
diff --git a/kleidicv/src/conversions/rgb_to_yuv_neon.cpp b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp
new file mode 100644
index 000000000..0b0dd6430
--- /dev/null
+++ b/kleidicv/src/conversions/rgb_to_yuv_neon.cpp
@@ -0,0 +1,215 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/conversions/rgb_to_yuv.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/neon.h"
+
+namespace kleidicv::neon {
+
+template <bool BGR, bool ALPHA>
+class RGBToYUVAll final : public UnrollOnce, public TryToAvoidTailLoop {
+ public:
+  using VecTraits = neon::VecTraits<uint8_t>;
+  using ScalarType = VecTraits::ScalarType;
+  using VectorType = VecTraits::VectorType;
+  using RawSourceVectorType =
+      typename std::conditional<ALPHA, uint8x16x4_t, uint8x16x3_t>::type;
+
+  explicit RGBToYUVAll() = default;
+
+  // Returns the number of channels in the output image.
+  static constexpr size_t input_channels() {
+    return ALPHA ? /* RGBA */ 4 : /* RGB */ 3;
+  }
+
+  void vector_path(const ScalarType *src, ScalarType *dst) {
+    RawSourceVectorType vsrc;
+    int16x8_t r_l, r_h, g_l, g_h, b_l, b_h;
+    if constexpr (ALPHA) {
+      vsrc = vld1q_u8_x4(src);
+      uint16x8_t rb_l = vuzp1q_u8(vsrc.val[0], vsrc.val[1]);
+      uint16x8_t rb_h = vuzp1q_u8(vsrc.val[2], vsrc.val[3]);
+      if constexpr (BGR) {
+        b_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
+        b_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
+        r_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
+        r_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
+      } else {
+        r_l = vreinterpretq_s16_u8(vtrn1q_u8(rb_l, vdupq_n_u8(0)));
+        r_h = vreinterpretq_s16_u8(vtrn1q_u8(rb_h, vdupq_n_u8(0)));
+        b_l = vreinterpretq_s16_u8(vtrn2q_u8(rb_l, vdupq_n_u8(0)));
+        b_h = vreinterpretq_s16_u8(vtrn2q_u8(rb_h, vdupq_n_u8(0)));
+      }
+      uint16x8_t ga_l = vuzp2q_u8(vsrc.val[0], vsrc.val[1]);
+      g_l = vreinterpretq_s16_u8(vtrn1q_u8(ga_l, vdupq_n_u8(0)));
+      uint16x8_t ga_h = vuzp2q_u8(vsrc.val[2], vsrc.val[3]);
+      g_h = vreinterpretq_s16_u8(vtrn1q_u8(ga_h, vdupq_n_u8(0)));
+    } else {
+      // Load deinterleaved
+      vsrc = vld3q_u8(src);
+      r_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
+      r_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[r_index_], vdupq_n_u8(0)));
+      g_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
+      g_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[g_index_], vdupq_n_u8(0)));
+      b_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
+      b_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[b_index_], vdupq_n_u8(0)));
+    }
+    // Compute Y value in 32-bit precision
+    int16x8_t y_l, y_h;
+    {
+      int32x4_t y_ll = vmull_n_s16(vget_low_s16(r_l), kRYWeight);
+      int32x4_t y_hl = vmull_n_s16(vget_low_s16(r_h), kRYWeight);
+      int32x4_t y_lh = vmull_high_n_s16(r_l, kRYWeight);
+      int32x4_t y_hh = vmull_high_n_s16(r_h, kRYWeight);
+
+      y_ll = vmlal_n_s16(y_ll, vget_low_s16(g_l), kGYWeight);
+      y_hl = vmlal_n_s16(y_hl, vget_low_s16(g_h), kGYWeight);
+      y_lh = vmlal_high_n_s16(y_lh, g_l, kGYWeight);
+      y_hh = vmlal_high_n_s16(y_hh, g_h, kGYWeight);
+
+      y_ll = vmlal_n_s16(y_ll, vget_low_s16(b_l), kBYWeight);
+      y_hl = vmlal_n_s16(y_hl, vget_low_s16(b_h), kBYWeight);
+      y_lh = vmlal_high_n_s16(y_lh, b_l, kBYWeight);
+      y_hh = vmlal_high_n_s16(y_hh, b_h, kBYWeight);
+
+      y_l = combine_scaled_s16(y_ll, y_lh);
+      y_h = combine_scaled_s16(y_hl, y_hh);
+    }
+
+    // Using the 16-bit Y value, calculate U
+    int16x8_t u_l, u_h;
+    {
+      int16x8_t uy_l = vqsubq(b_l, y_l);
+      int16x8_t uy_h = vqsubq(b_h, y_h);
+
+      int32x4_t u_ll = vdupq_n_s32(half_);
+      int32x4_t u_lh = u_ll;
+      int32x4_t u_hl = u_ll;
+      int32x4_t u_hh = u_ll;
+
+      u_ll = vmlal_n_s16(u_ll, vget_low_s16(uy_l), kBUWeight);
+      u_hl = vmlal_n_s16(u_hl, vget_low_s16(uy_h), kBUWeight);
+      u_lh = vmlal_high_n_s16(u_lh, uy_l, kBUWeight);
+      u_hh = vmlal_high_n_s16(u_hh, uy_h, kBUWeight);
+
+      u_l = combine_scaled_s16(u_ll, u_lh);
+      u_h = combine_scaled_s16(u_hl, u_hh);
+    }
+
+    // Using the 16-bit Y value, calculate V
+    int16x8_t v_l, v_h;
+    {
+      int16x8_t vy_l = vqsubq(r_l, y_l);
+      int16x8_t vy_h = vqsubq(r_h, y_h);
+
+      int32x4_t v_ll = vdupq_n_s32(half_);
+      int32x4_t v_lh = v_ll;
+      int32x4_t v_hl = v_ll;
+      int32x4_t v_hh = v_ll;
+
+      v_ll = vmlal_n_s16(v_ll, vget_low_s16(vy_l), kRVWeight);
+      v_hl = vmlal_n_s16(v_hl, vget_low_s16(vy_h), kRVWeight);
+      v_lh = vmlal_high_n_s16(v_lh, vy_l, kRVWeight);
+      v_hh = vmlal_high_n_s16(v_hh, vy_h, kRVWeight);
+
+      v_l = combine_scaled_s16(v_ll, v_lh);
+      v_h = combine_scaled_s16(v_hl, v_hh);
+    }
+
+    // Narrow the results to 8 bits
+    uint8x16x3_t yuv;
+    yuv.val[0] = vcombine_u8(vqmovun_s16(y_l), vqmovun_s16(y_h));
+    yuv.val[1] = vcombine_u8(vqmovun_s16(u_l), vqmovun_s16(u_h));
+    yuv.val[2] = vcombine_u8(vqmovun_s16(v_l), vqmovun_s16(v_h));
+
+    // Store interleaved YUV pixels to memory.
+    vst3q_u8(dst, yuv);
+  }
+
+  void scalar_path(const ScalarType *src, ScalarType *dst) {
+    int32_t y = src[r_index_] * kRYWeight + src[g_index_] * kGYWeight +
+                src[b_index_] * kBYWeight;
+    y = rounding_shift_right(y, kWeightScale);
+    int32_t u = (src[b_index_] - y) * kBUWeight + half_;
+    u = rounding_shift_right(u, kWeightScale);
+    int32_t v = (src[r_index_] - y) * kRVWeight + half_;
+    v = rounding_shift_right(v, kWeightScale);
+    dst[0] = saturating_cast<int32_t, uint8_t>(y);
+    dst[1] = saturating_cast<int32_t, uint8_t>(u);
+    dst[2] = saturating_cast<int32_t, uint8_t>(v);
+  }
+
+ private:
+  static constexpr size_t r_index_ = BGR ? 2 : 0;
+  static constexpr size_t g_index_ = 1;
+  static constexpr size_t b_index_ = BGR ? 0 : 2;
+  static constexpr size_t step_ = ALPHA ? 4 : 3;
+  static constexpr uint32_t half_ =
+      (std::numeric_limits<uint8_t>::max() / 2 + 1U) << kWeightScale;
+
+  static int16x8_t combine_scaled_s16(int32x4_t a, int32x4_t b) {
+    return vrshrn_high_n_s32(vrshrn_n_s32(a, kWeightScale), b, kWeightScale);
+  }
+};  // end of class RGBToYUVAll<bool BGR, bool ALPHA>
+
+template <typename OperationType, typename ScalarType>
+kleidicv_error_t rgb2yuv_operation(OperationType &operation,
+                                   const ScalarType *src, size_t src_stride,
+                                   ScalarType *dst, size_t dst_stride,
+                                   size_t width, size_t height) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride);
+  CHECK_IMAGE_SIZE(width, height);
+
+  Rectangle rect{width, height};
+  Rows src_rows{src, src_stride, operation.input_channels()};
+  Rows dst_rows{dst, dst_stride, 3};
+
+  apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  return KLEIDICV_OK;
+}
+
+using RGBToYUV = RGBToYUVAll<false, false>;
+using RGBAToYUV = RGBToYUVAll<false, true>;
+using BGRToYUV = RGBToYUVAll<true, false>;
+using BGRAToYUV = RGBToYUVAll<true, true>;
+
+KLEIDICV_TARGET_FN_ATTRS
+kleidicv_error_t rgb_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                               uint8_t *dst, size_t dst_stride, size_t width,
+                               size_t height) {
+  RGBToYUV operation;
+  return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
+                           height);
+}
+
+KLEIDICV_TARGET_FN_ATTRS
+kleidicv_error_t rgba_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                                uint8_t *dst, size_t dst_stride, size_t width,
+                                size_t height) {
+  RGBAToYUV operation;
+  return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
+                           height);
+}
+
+KLEIDICV_TARGET_FN_ATTRS
+kleidicv_error_t bgr_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                               uint8_t *dst, size_t dst_stride, size_t width,
+                               size_t height) {
+  BGRToYUV operation;
+  return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
+                           height);
+}
+
+KLEIDICV_TARGET_FN_ATTRS
+kleidicv_error_t bgra_to_yuv_u8(const uint8_t *src, size_t src_stride,
+                                uint8_t *dst, size_t dst_stride, size_t width,
+                                size_t height) {
+  BGRAToYUV operation;
+  return rgb2yuv_operation(operation, src, src_stride, dst, dst_stride, width,
+                           height);
+}
+
+}  // namespace kleidicv::neon
diff --git a/test/api/test_rgb_to_yuv.cpp b/test/api/test_rgb_to_yuv.cpp
new file mode 100644
index 000000000..0a848ae36
--- /dev/null
+++ b/test/api/test_rgb_to_yuv.cpp
@@ -0,0 +1,149 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include "framework/array.h"
+#include "framework/utils.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/utils.h"
+#include "test_config.h"
+
+class RGB2YUVTest final {
+ public:
+  RGB2YUVTest(size_t channel_number, bool switch_blue)
+      : channel_number_(channel_number), switch_blue_(switch_blue) {}
+
+  template <typename F>
+  void execute_scalar_test(F impl) {
+    size_t scalar_path_width = 5;
+    execute_test(impl, scalar_path_width, 0);
+    // Padding version
+    execute_test(impl, scalar_path_width, test::Options::vector_length());
+  }
+
+  template <typename F>
+  void execute_vector_test(F impl) {
+    size_t vector_path_width = (2 * test::Options::vector_lanes<uint8_t>()) - 3;
+    execute_test(impl, vector_path_width, 0);
+    // Padding version
+    execute_test(impl, vector_path_width, test::Options::vector_length());
+  }
+
+ private:
+  template <typename F>
+  void execute_test(F impl, size_t logical_width, size_t padding) {
+    test::Array2D<uint8_t> src{logical_width * channel_number_, 5, padding};
+    src.fill(0);
+    src.set(0, 0, {0, 22, 4, 0, 27, 9, 255, 125, 255, 255, 60, 255});
+    src.set(1, 0, {0, 154, 0, 0, 154, 0, 61, 255, 11, 47, 255, 0});
+    src.set(2, 0, {0, 22, 4, 76, 143, 125, 203, 0, 255, 204, 0, 255});
+    src.set(4, 0, {0, 145, 0, 0, 145, 0, 0, 255, 0, 0, 255, 0});
+
+    test::Array2D<uint8_t> expected{logical_width * 3, src.height(), padding};
+    expected.fill(0);
+    calculate_expected(src, expected);
+
+    test::Array2D<uint8_t> actual{logical_width * 3, src.height(), padding};
+    actual.fill(42);
+    auto err = impl(src.data(), src.stride(), actual.data(), actual.stride(),
+                    expected.width() / 3, expected.height());
+
+    ASSERT_EQ(KLEIDICV_OK, err);
+    EXPECT_EQ_ARRAY2D(expected, actual);
+
+    test::test_null_args(impl, src.data(), src.stride(), actual.data(),
+                         actual.stride(), expected.width() / channel_number_,
+                         expected.height());
+
+    EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), actual.data(),
+                                actual.stride(), 0, 1));
+    EXPECT_EQ(KLEIDICV_OK, impl(src.data(), src.stride(), actual.data(),
+                                actual.stride(), 1, 0));
+
+    EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+              impl(src.data(), src.stride(), actual.data(), actual.stride(),
+                   KLEIDICV_MAX_IMAGE_PIXELS + 1, 1));
+    EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+              impl(src.data(), src.stride(), actual.data(), actual.stride(),
+                   KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS));
+  }
+
+  void calculate_expected(test::Array2D<uint8_t> &src_arr,
+                          test::Array2D<uint8_t> &exp_arr) const {
+    for (size_t vindex = 0; vindex < exp_arr.height(); vindex++) {
+      for (size_t hindex = 0; hindex < exp_arr.width() / 3; hindex++) {
+        // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign)
+        int32_t r = *src_arr.at(
+            vindex, hindex * channel_number_ + (switch_blue_ ? 2 : 0));
+        int32_t g = *src_arr.at(vindex, hindex * channel_number_ + 1);
+        int32_t b = *src_arr.at(
+            vindex, hindex * channel_number_ + (switch_blue_ ? 0 : 2));
+        // NOLINTEND(clang-analyzer-core.uninitialized.Assign)
+
+        static const int32_t R2Y = 4899, G2Y = 9617, B2Y = 1868;
+        static const int32_t R2V = 14369, B2U = 8061;
+
+        int32_t y = (r * R2Y + g * G2Y + b * B2Y + (1 << 13)) >> 14;
+        int32_t u = (((b - y) * B2U + (1 << 13)) >> 14) + 128;
+        int32_t v = (((r - y) * R2V + (1 << 13)) >> 14) + 128;
+
+        uint8_t y_u8 = saturate_cast_s32_to_u8(y);
+        uint8_t u_u8 = saturate_cast_s32_to_u8(u);
+        uint8_t v_u8 = saturate_cast_s32_to_u8(v);
+
+        exp_arr.set(vindex, hindex * 3, {y_u8, u_u8, v_u8});
+      }
+    }
+  }
+
+  static uint8_t saturate_cast_s32_to_u8(int32_t rhs) {
+    return static_cast<uint8_t>(
+        std::min(std::max(0, rhs),
+                 static_cast<int32_t>(std::numeric_limits<uint8_t>::max())));
+  }
+
+  size_t channel_number_;
+  bool switch_blue_;
+};
+
+TEST(RGBToYUV, RGB_YUV_scalar) {
+  RGB2YUVTest rgb2yuv_test(3, false);
+  rgb2yuv_test.execute_scalar_test(kleidicv_rgb_to_yuv_u8);
+}
+
+TEST(RGBToYUV, RGB_YUV_vector) {
+  RGB2YUVTest rgb2yuv_test(3, false);
+  rgb2yuv_test.execute_vector_test(kleidicv_rgb_to_yuv_u8);
+}
+
+TEST(RGBToYUV, RGBA_YUV_scalar) {
+  RGB2YUVTest rgb2yuv_test(4, false);
+  rgb2yuv_test.execute_scalar_test(kleidicv_rgba_to_yuv_u8);
+}
+
+TEST(RGBToYUV, RGBA_YUV_vector) {
+  RGB2YUVTest rgb2yuv_test(4, false);
+  rgb2yuv_test.execute_vector_test(kleidicv_rgba_to_yuv_u8);
+}
+
+TEST(RGBToYUV, BGR_YUV_scalar) {
+  RGB2YUVTest rgb2yuv_test(3, true);
+  rgb2yuv_test.execute_scalar_test(kleidicv_bgr_to_yuv_u8);
+}
+
+TEST(RGBToYUV, BGR_YUV_vector) {
+  RGB2YUVTest rgb2yuv_test(3, true);
+  rgb2yuv_test.execute_vector_test(kleidicv_bgr_to_yuv_u8);
+}
+
+TEST(RGBToYUV, BGRA_YUV_scalar) {
+  RGB2YUVTest rgb2yuv_test(4, true);
+  rgb2yuv_test.execute_scalar_test(kleidicv_bgra_to_yuv_u8);
+}
+
+TEST(RGBToYUV, BGRA_YUV_vector) {
+  RGB2YUVTest rgb2yuv_test(4, true);
+  rgb2yuv_test.execute_vector_test(kleidicv_bgra_to_yuv_u8);
+}
-- 
GitLab