diff --git a/CHANGELOG.md b/CHANGELOG.md
index e014dfea25c8e187f679fbba75bcffd4f339a125..15f628c5db436d1a59b3af59069a6c1776618b0c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,8 +36,18 @@ This changelog aims to follow the guiding principles of
 ### Changed
 - Filter context creation API specification.
 - Gaussian Blur API specification.
-- In the OpenCV HAL, cvtColor YUV2RGB_NV21 is multithreaded.
-- In the OpenCV HAL, minMaxIdx is multithreaded.
+- In the OpenCV HAL, the following operations are multithreaded:
+  * cvtColor
+  * threshold
+  * convertTo
+  * exp
+  * compare
+  * add
+  * sub
+  * mul
+  * absdiff
+  * bitwise_and
+  * minMaxIdx
 - Improved performance of Compare Equal and Greater SC API.
 
 ### Removed
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 3b3b49c7e6bfdca2cfa720c44ef09a9e11351d53..807c8e324ff5a26b1d663b564bca188e1a1b5f2d 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -68,7 +68,7 @@ static kleidicv_error_t parallel(kleidicv_thread_callback callback,
   return shared_result;
 }
 
-static kleidicv_thread_multithreading get_multithreading() {
+kleidicv_thread_multithreading get_multithreading() {
   return kleidicv_thread_multithreading{parallel, nullptr};
 }
 
@@ -79,15 +79,17 @@ int gray_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data,
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
+  auto mt = get_multithreading();
+
   if (depth == CV_8U) {
     if (dcn == 3) {
-      return convert_error(kleidicv_gray_to_rgb_u8(
+      return convert_error(kleidicv_thread_gray_to_rgb_u8(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
-          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height, mt));
     }
-    return convert_error(kleidicv_gray_to_rgba_u8(
+    return convert_error(kleidicv_thread_gray_to_rgba_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
-        reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+        reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height, mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
@@ -101,27 +103,31 @@ int bgr_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data,
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
+  auto mt = get_multithreading();
+
   if (depth == CV_8U) {
     if (scn == 3 && dcn == 3) {
       if (swapBlue) {
-        return convert_error(kleidicv_rgb_to_bgr_u8(
+        return convert_error(kleidicv_thread_rgb_to_bgr_u8(
             reinterpret_cast<const uint8_t *>(src_data), src_step,
-            reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+            reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height,
+            mt));
       }
-      return convert_error(kleidicv_rgb_to_rgb_u8(
+      return convert_error(kleidicv_thread_rgb_to_rgb_u8(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
-          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height, mt));
     }
 
     if (scn == 4 && dcn == 4) {
       if (swapBlue) {
-        return convert_error(kleidicv_rgba_to_bgra_u8(
+        return convert_error(kleidicv_thread_rgba_to_bgra_u8(
             reinterpret_cast<const uint8_t *>(src_data), src_step,
-            reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+            reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height,
+            mt));
       }
-      return convert_error(kleidicv_rgba_to_rgba_u8(
+      return convert_error(kleidicv_thread_rgba_to_rgba_u8(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
-          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height, mt));
     }
   }
 
@@ -144,34 +150,36 @@ int yuv_to_bgr_sp_ex(const uchar *y_data, size_t y_step, const uchar *uv_data,
   const bool is_bgr = !swapBlue;
   const bool is_nv21 = (uIdx != 0);
 
+  auto mt = get_multithreading();
+
   if (dcn == 3) {
     if (is_bgr) {
-      return convert_error(kleidicv_yuv_sp_to_bgr_u8(
+      return convert_error(kleidicv_thread_yuv_sp_to_bgr_u8(
           reinterpret_cast<const uint8_t *>(y_data), y_step,
           reinterpret_cast<const uint8_t *>(uv_data), uv_step,
           reinterpret_cast<uint8_t *>(dst_data), dst_step, dst_width,
-          dst_height, is_nv21));
+          dst_height, is_nv21, mt));
     }
     return convert_error(kleidicv_thread_yuv_sp_to_rgb_u8(
         reinterpret_cast<const uint8_t *>(y_data), y_step,
         reinterpret_cast<const uint8_t *>(uv_data), uv_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step, dst_width, dst_height,
-        is_nv21, get_multithreading()));
+        is_nv21, mt));
   }
 
   if (dcn == 4) {
     if (is_bgr) {
-      return convert_error(kleidicv_yuv_sp_to_bgra_u8(
+      return convert_error(kleidicv_thread_yuv_sp_to_bgra_u8(
           reinterpret_cast<const uint8_t *>(y_data), y_step,
           reinterpret_cast<const uint8_t *>(uv_data), uv_step,
           reinterpret_cast<uint8_t *>(dst_data), dst_step, dst_width,
-          dst_height, is_nv21));
+          dst_height, is_nv21, mt));
     }
-    return convert_error(kleidicv_yuv_sp_to_rgba_u8(
+    return convert_error(kleidicv_thread_yuv_sp_to_rgba_u8(
         reinterpret_cast<const uint8_t *>(y_data), y_step,
         reinterpret_cast<const uint8_t *>(uv_data), uv_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step, dst_width, dst_height,
-        is_nv21));
+        is_nv21, mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
@@ -186,16 +194,18 @@ int yuv_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data,
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
+  auto mt = get_multithreading();
+
   if (is_bgr) {
-    return convert_error(kleidicv_yuv_to_bgr_u8(
+    return convert_error(kleidicv_thread_yuv_to_bgr_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step,
-        static_cast<size_t>(width), static_cast<size_t>(height)));
+        static_cast<size_t>(width), static_cast<size_t>(height), mt));
   }
-  return convert_error(kleidicv_yuv_to_rgb_u8(
+  return convert_error(kleidicv_thread_yuv_to_rgb_u8(
       reinterpret_cast<const uint8_t *>(src_data), src_step,
       reinterpret_cast<uint8_t *>(dst_data), dst_step,
-      static_cast<size_t>(width), static_cast<size_t>(height)));
+      static_cast<size_t>(width), static_cast<size_t>(height), mt));
 }
 
 int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data,
@@ -207,30 +217,32 @@ int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data,
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
+  auto mt = get_multithreading();
+
   if (scn == 3) {
     if (is_bgr) {
-      return convert_error(kleidicv_bgr_to_yuv_u8(
+      return convert_error(kleidicv_thread_bgr_to_yuv_u8(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
           reinterpret_cast<uint8_t *>(dst_data), dst_step,
-          static_cast<size_t>(width), static_cast<size_t>(height)));
+          static_cast<size_t>(width), static_cast<size_t>(height), mt));
     }
-    return convert_error(kleidicv_rgb_to_yuv_u8(
+    return convert_error(kleidicv_thread_rgb_to_yuv_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step,
-        static_cast<size_t>(width), static_cast<size_t>(height)));
+        static_cast<size_t>(width), static_cast<size_t>(height), mt));
   }
 
   if (scn == 4) {
     if (is_bgr) {
-      return convert_error(kleidicv_bgra_to_yuv_u8(
+      return convert_error(kleidicv_thread_bgra_to_yuv_u8(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
           reinterpret_cast<uint8_t *>(dst_data), dst_step,
-          static_cast<size_t>(width), static_cast<size_t>(height)));
+          static_cast<size_t>(width), static_cast<size_t>(height), mt));
     }
-    return convert_error(kleidicv_rgba_to_yuv_u8(
+    return convert_error(kleidicv_thread_rgba_to_yuv_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step,
-        static_cast<size_t>(width), static_cast<size_t>(height)));
+        static_cast<size_t>(width), static_cast<size_t>(height), mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
@@ -239,12 +251,15 @@ int bgr_to_yuv(const uchar *src_data, size_t src_step, uchar *dst_data,
 int threshold(const uchar *src_data, size_t src_step, uchar *dst_data,
               size_t dst_step, int width, int height, int depth, int cn,
               double thresh, double maxValue, int thresholdType) {
+  auto mt = get_multithreading();
+
   if ((depth == CV_8U) && (thresholdType == 0 /* THRESH_BINARY */)) {
     size_t width_in_elements = width * cn;
-    return convert_error(kleidicv_threshold_binary_u8(
+    return convert_error(kleidicv_thread_threshold_binary_u8(
         reinterpret_cast<const uint8_t *>(src_data), src_step,
         reinterpret_cast<uint8_t *>(dst_data), dst_step, width_in_elements,
-        height, static_cast<uint8_t>(thresh), static_cast<uint8_t>(maxValue)));
+        height, static_cast<uint8_t>(thresh), static_cast<uint8_t>(maxValue),
+        mt));
   }
 
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
@@ -929,6 +944,8 @@ int min_max_idx(const uchar *src_data, size_t src_step, int width, int height,
 int convertTo(const uchar *src_data, size_t src_step, int src_depth,
               uchar *dst_data, size_t dst_step, int dst_depth, int width,
               int height, double scale, double shift) {
+  auto mt = get_multithreading();
+
   // scaling only
   if (src_depth == dst_depth) {
     // no scaling, no advantage
@@ -939,15 +956,15 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
 
     switch (src_depth) {
       case CV_8U:
-        return convert_error(kleidicv_scale_u8(
+        return convert_error(kleidicv_thread_scale_u8(
             reinterpret_cast<const uint8_t *>(src_data), src_step,
             reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height,
-            static_cast<float>(scale), static_cast<float>(shift)));
+            static_cast<float>(scale), static_cast<float>(shift), mt));
       case CV_32F:
-        return convert_error(kleidicv_scale_f32(
+        return convert_error(kleidicv_thread_scale_f32(
             reinterpret_cast<const float *>(src_data), src_step,
             reinterpret_cast<float *>(dst_data), dst_step, width, height,
-            static_cast<float>(scale), static_cast<float>(shift)));
+            static_cast<float>(scale), static_cast<float>(shift), mt));
       default:
         break;
     }
@@ -957,45 +974,49 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
   if (scale == 1.0 && shift == 0.0) {
     // float32 to int8
     if (src_depth == CV_32F && dst_depth == CV_8S) {
-      return convert_error(kleidicv_float_conversion_f32_s8(
+      return convert_error(kleidicv_thread_float_conversion_f32_s8(
           reinterpret_cast<const float *>(src_data), src_step,
-          reinterpret_cast<int8_t *>(dst_data), dst_step, width, height));
+          reinterpret_cast<int8_t *>(dst_data), dst_step, width, height, mt));
     }
     // float32 to uint8
     if (src_depth == CV_32F && dst_depth == CV_8U) {
-      return convert_error(kleidicv_float_conversion_f32_u8(
+      return convert_error(kleidicv_thread_float_conversion_f32_u8(
           reinterpret_cast<const float *>(src_data), src_step,
-          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height));
+          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height, mt));
     }
     // int8 to float32
     if (src_depth == CV_8S && dst_depth == CV_32F) {
-      return convert_error(kleidicv_float_conversion_s8_f32(
+      return convert_error(kleidicv_thread_float_conversion_s8_f32(
           reinterpret_cast<const int8_t *>(src_data), src_step,
-          reinterpret_cast<float *>(dst_data), dst_step, width, height));
+          reinterpret_cast<float *>(dst_data), dst_step, width, height, mt));
     }
     // uint8 to float32
     if (src_depth == CV_8U && dst_depth == CV_32F) {
-      return convert_error(kleidicv_float_conversion_u8_f32(
+      return convert_error(kleidicv_thread_float_conversion_u8_f32(
           reinterpret_cast<const uint8_t *>(src_data), src_step,
-          reinterpret_cast<float *>(dst_data), dst_step, width, height));
+          reinterpret_cast<float *>(dst_data), dst_step, width, height, mt));
     }
   }
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
 int exp32f(const float *src, float *dst, int len) {
-  return convert_error(kleidicv_exp_f32(src, len * sizeof(float), dst,
-                                        len * sizeof(float), len, 1));
+  auto mt = get_multithreading();
+
+  return convert_error(kleidicv_thread_exp_f32(
+      src, len * sizeof(float), dst, len * sizeof(float), len, 1, mt));
 }
 
 int compare_u8(const uchar *src1_data, size_t src1_step, const uchar *src2_data,
                size_t src2_step, uchar *dst_data, size_t dst_step, int width,
                int height, int operation) {
+  auto mt = get_multithreading();
+
   switch (operation) {
     case cv::CMP_GT:
-      return convert_error(kleidicv_compare_greater_u8(
+      return convert_error(kleidicv_thread_compare_greater_u8(
           src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width,
-          height));
+          height, mt));
     default:
       return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 6cc6af83171172293441c31a572c82d307f15cf8..ea6beaf857fdc6839035c2ce957c25026202fb50 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -9,6 +9,7 @@
 #include <type_traits>
 
 #include "kleidicv/kleidicv.h"
+#include "kleidicv_thread/kleidicv_thread.h"
 #include "opencv2/core/hal/interface.h"
 
 // Forward declarations of OpenCV internals.
@@ -21,6 +22,8 @@ namespace hal {
 // Macros to shorten repeated code.
 #define KLEIDICV_HAL_API(api) (kleidicv::hal::api)
 
+kleidicv_thread_multithreading get_multithreading();
+
 int gray_to_bgr(const uchar *src_data, size_t src_step, uchar *dst_data,
                 size_t dst_step, int width, int height, int depth, int dcn);
 
@@ -137,9 +140,11 @@ namespace cv {
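-// If the KleidiCV function has a signature matching the OpenCV HAL interface
-// AND it never returns KLEIDICV_NOT_IMPLEMENTED then we can call it directly
-// and convert the return code.
+// If the kleidicv_thread function matches the OpenCV HAL signature, apart
+// from its trailing multithreading argument, AND it never returns
+// KLEIDICV_NOT_IMPLEMENTED then we can call it and convert the return code.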
-#define KLEIDICV_HAL_FORWARD(kleidicv_impl, ...)               \
-  (kleidicv_impl(__VA_ARGS__) == KLEIDICV_OK ? CV_HAL_ERROR_OK \
-                                             : CV_HAL_ERROR_UNKNOWN)
+#define KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_impl, ...)               \
+  (kleidicv_thread_impl(__VA_ARGS__, kleidicv::hal::get_multithreading()) == \
+           KLEIDICV_OK                                                       \
+       ? CV_HAL_ERROR_OK                                                     \
+       : CV_HAL_ERROR_UNKNOWN)
 
 #define KLEIDICV_HAL_FALLBACK_FORWARD(kleidicv_impl, fallback_hal_impl, ...) \
   (KLEIDICV_HAL_API(kleidicv_impl)(__VA_ARGS__) == CV_HAL_ERROR_OK           \
@@ -436,63 +441,63 @@ static inline int kleidicv_compare_u8_with_fallback(
 
 // clang-format off
 #undef cv_hal_add8s
-#define cv_hal_add8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s8, __VA_ARGS__)
+#define cv_hal_add8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s8, __VA_ARGS__)
 #undef cv_hal_add8u
-#define cv_hal_add8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u8, __VA_ARGS__)
+#define cv_hal_add8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u8, __VA_ARGS__)
 #undef cv_hal_add16s
-#define cv_hal_add16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s16, __VA_ARGS__)
+#define cv_hal_add16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_s16, __VA_ARGS__)
 #undef cv_hal_add16u
-#define cv_hal_add16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u16, __VA_ARGS__)
+#define cv_hal_add16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_add_u16, __VA_ARGS__)
 
 #undef cv_hal_sub8s
-#define cv_hal_sub8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s8, __VA_ARGS__)
+#define cv_hal_sub8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s8, __VA_ARGS__)
 #undef cv_hal_sub8u
-#define cv_hal_sub8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u8, __VA_ARGS__)
+#define cv_hal_sub8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u8, __VA_ARGS__)
 #undef cv_hal_sub16s
-#define cv_hal_sub16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s16, __VA_ARGS__)
+#define cv_hal_sub16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_s16, __VA_ARGS__)
 #undef cv_hal_sub16u
-#define cv_hal_sub16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u16, __VA_ARGS__)
+#define cv_hal_sub16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_sub_u16, __VA_ARGS__)
 
 #undef cv_hal_absdiff8s
-#define cv_hal_absdiff8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s8, __VA_ARGS__)
+#define cv_hal_absdiff8s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s8, __VA_ARGS__)
 #undef cv_hal_absdiff8u
-#define cv_hal_absdiff8u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u8, __VA_ARGS__)
+#define cv_hal_absdiff8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u8, __VA_ARGS__)
 #undef cv_hal_absdiff16s
-#define cv_hal_absdiff16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s16, __VA_ARGS__)
+#define cv_hal_absdiff16s(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_s16, __VA_ARGS__)
 #undef cv_hal_absdiff16u
-#define cv_hal_absdiff16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u16, __VA_ARGS__)
+#define cv_hal_absdiff16u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_saturating_absdiff_u16, __VA_ARGS__)
 
 #undef cv_hal_and8u
-#define cv_hal_and8u(...) KLEIDICV_HAL_FORWARD(kleidicv_bitwise_and, __VA_ARGS__)
+#define cv_hal_and8u(...) KLEIDICV_THREAD_HAL_FORWARD(kleidicv_thread_bitwise_and, __VA_ARGS__)
 // clang-format on
 
-#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T)                            \
-  static inline int kleidicv_##suffix##_with_fallback(                        \
-      const T *src_a, size_t src_a_stride, const T *src_b,                    \
-      size_t src_b_stride, T *dst, size_t dst_stride, size_t width,           \
-      size_t height, double scale) {                                          \
-    if (scale != 1.0) {                                                       \
-      return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst,   \
-                             dst_stride, width, height, scale);               \
-    }                                                                         \
-    return KLEIDICV_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride, src_b,    \
-                                src_b_stride, dst, dst_stride, width, height, \
-                                scale);                                       \
+#define KLEIDICV_HAL_MUL(suffix, kleidicv_impl, T)                           \
+  static inline int kleidicv_##suffix##_with_fallback(                       \
+      const T *src_a, size_t src_a_stride, const T *src_b,                   \
+      size_t src_b_stride, T *dst, size_t dst_stride, size_t width,          \
+      size_t height, double scale) {                                         \
+    if (scale != 1.0) {                                                      \
+      return cv_hal_##suffix(src_a, src_a_stride, src_b, src_b_stride, dst,  \
+                             dst_stride, width, height, scale);              \
+    }                                                                        \
+    return KLEIDICV_THREAD_HAL_FORWARD(kleidicv_impl, src_a, src_a_stride,   \
+                                       src_b, src_b_stride, dst, dst_stride, \
+                                       width, height, scale);                \
   }
 
-KLEIDICV_HAL_MUL(mul8u, kleidicv_saturating_multiply_u8, uint8_t);
+KLEIDICV_HAL_MUL(mul8u, kleidicv_thread_saturating_multiply_u8, uint8_t);
 #undef cv_hal_mul8u
 #define cv_hal_mul8u kleidicv_mul8u_with_fallback
 
-KLEIDICV_HAL_MUL(mul8s, kleidicv_saturating_multiply_s8, int8_t);
+KLEIDICV_HAL_MUL(mul8s, kleidicv_thread_saturating_multiply_s8, int8_t);
 #undef cv_hal_mul8s
 #define cv_hal_mul8s kleidicv_mul8s_with_fallback
 
-KLEIDICV_HAL_MUL(mul16u, kleidicv_saturating_multiply_u16, uint16_t);
+KLEIDICV_HAL_MUL(mul16u, kleidicv_thread_saturating_multiply_u16, uint16_t);
 #undef cv_hal_mul16u
 #define cv_hal_mul16u kleidicv_mul16u_with_fallback
 
-KLEIDICV_HAL_MUL(mul16s, kleidicv_saturating_multiply_s16, int16_t);
+KLEIDICV_HAL_MUL(mul16s, kleidicv_thread_saturating_multiply_s16, int16_t);
 #undef cv_hal_mul16s
 #define cv_hal_mul16s kleidicv_mul16s_with_fallback
 
diff --git a/doc/opencv.md b/doc/opencv.md
index 0d0b3de03ca7e7c10f579bbacbc67b2f8cac371d..4d840bf46428ce17fde1615b89f0dbc849fdae81 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -14,9 +14,21 @@ For details of building OpenCV with KleidiCV see [the build documentation](build
 
 For single-threaded use cases, enabling the KleidiCV OpenCV HAL is likely to
 provide a performance boost for the functions that it implements.
-Multithreading is planned for KleidiCV but at present it is single-threaded
-only. Therefore it is recommended to check the performance yourself in order
-to decide whether to enable KleidiCV in a multicore environment.
+The same is true for multi-threaded use cases, with the exception of the
+following functions:
+
+* dilate
+* erode
+* GaussianBlur
+* resize
+* sepFilter2D
+* Sobel
+* transpose
+
+Multithreading is planned for these functions in KleidiCV but at present they
+are single-threaded only. Therefore it is recommended to check the performance
+yourself in order to decide whether to enable KleidiCV in a multicore
+environment.
 
 ## Functionality in KleidiCV OpenCV HAL
 
diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
index 5a1f0f863d92cda55912e04524f898c1a34cc648..ae0c04e0b3606a511275ff681d81cedf37fb17ac 100644
--- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
+++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
@@ -46,15 +46,61 @@ typedef struct {
   void *parallel_data;
 } kleidicv_thread_multithreading;
 
+#define KLEIDICV_THREAD_UNARY_OP(name, src_type, dst_type)                     \
+  kleidicv_error_t name(const src_type *src, size_t src_stride, dst_type *dst, \
+                        size_t dst_stride, size_t width, size_t height,        \
+                        kleidicv_thread_multithreading)
+
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_gray_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_gray_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_bgra_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_bgra_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_yuv_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_yuv_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_bgr_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgb_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_bgra_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_rgba_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_exp_f32, float, float);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_f32_s8, float,
+                         int8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_f32_u8, float,
+                         uint8_t);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_s8_f32, int8_t,
+                         float);
+KLEIDICV_THREAD_UNARY_OP(kleidicv_thread_float_conversion_u8_f32, uint8_t,
+                         float);
+
 /// Internal - not part of the public API and its direct use is not supported.
 ///
-/// Multithreaded implementation of kleidicv_yuv_sp_to_rgb_u8 - see the
-/// documentation of that function for more details.
+/// Multithreaded implementations of the kleidicv_yuv_sp_to_*_u8 functions
+/// (bgr, bgra, rgb, rgba) - see their documentation for more details.
+kleidicv_error_t kleidicv_thread_yuv_sp_to_bgr_u8(
+    const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
+    size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
+    size_t height, bool is_nv21, kleidicv_thread_multithreading);
+
+kleidicv_error_t kleidicv_thread_yuv_sp_to_bgra_u8(
+    const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
+    size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
+    size_t height, bool is_nv21, kleidicv_thread_multithreading);
+
 kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8(
     const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
     size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
     size_t height, bool is_nv21, kleidicv_thread_multithreading);
 
+kleidicv_error_t kleidicv_thread_yuv_sp_to_rgba_u8(
+    const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
+    size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
+    size_t height, bool is_nv21, kleidicv_thread_multithreading);
+
 /// Internal - not part of the public API and its direct use is not supported.
 ///
 /// Multithreaded implementation of kleidicv_min_max_u8 - see the
@@ -123,6 +169,75 @@ kleidicv_error_t kleidicv_thread_min_max_loc_u8(
     const uint8_t *src, size_t src_stride, size_t width, size_t height,
     size_t *min_offset, size_t *max_offset, kleidicv_thread_multithreading);
 
+kleidicv_error_t kleidicv_thread_threshold_binary_u8(
+    const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
+    size_t width, size_t height, uint8_t threshold, uint8_t value,
+    kleidicv_thread_multithreading);
+
+kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
+                                          uint8_t *dst, size_t dst_stride,
+                                          size_t width, size_t height,
+                                          float scale, float shift,
+                                          kleidicv_thread_multithreading);
+
+kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
+                                           float *dst, size_t dst_stride,
+                                           size_t width, size_t height,
+                                           float scale, float shift,
+                                           kleidicv_thread_multithreading);
+
+#define KLEIDICV_THREAD_BINARY_OP(name, type)                              \
+  kleidicv_error_t name(const type *src_a, size_t src_a_stride,            \
+                        const type *src_b, size_t src_b_stride, type *dst, \
+                        size_t dst_stride, size_t width, size_t height,    \
+                        kleidicv_thread_multithreading)
+
+#define KLEIDICV_THREAD_BINARY_OP_SCALE(name, type, scaletype)             \
+  kleidicv_error_t name(const type *src_a, size_t src_a_stride,            \
+                        const type *src_b, size_t src_b_stride, type *dst, \
+                        size_t dst_stride, size_t width, size_t height,    \
+                        scaletype scale, kleidicv_thread_multithreading)
+
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u32, uint32_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_s64, int64_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_add_u64, uint64_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u32, uint32_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_s64, int64_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_sub_u64, uint64_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_saturating_absdiff_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_u8, uint8_t,
+                                double);
+KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s8, int8_t,
+                                double);
+KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_u16,
+                                uint16_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s16,
+                                int16_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE(kleidicv_thread_saturating_multiply_s32,
+                                int32_t, double);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_bitwise_and, uint8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_equal_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_greater_u8, uint8_t);
+
+kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16(
+    const int16_t *src_a, size_t src_a_stride, const int16_t *src_b,
+    size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width,
+    size_t height, int16_t threshold, kleidicv_thread_multithreading);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index 96de2d333eb1bdd606027bdc223b7d13e123725f..70107a5091bc699f2848712eae08c41df3ee0947 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -5,50 +5,201 @@
 #include "kleidicv_thread/kleidicv_thread.h"
 
 #include <algorithm>
+#include <functional>
 #include <limits>
 #include <vector>
 
 #include "kleidicv/kleidicv.h"
 
-struct kleidicv_thread_yuv_sp_to_rgb_u8_data {
-  const uint8_t *src_y;
-  size_t src_y_stride;
-  const uint8_t *src_uv;
-  size_t src_uv_stride;
-  uint8_t *dst;
-  size_t dst_stride;
-  size_t width;
-  size_t height;
-  bool is_nv21;
-};
+typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback;
 
-static kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_callback(
-    unsigned task_begin, unsigned task_end, void *void_data) {
-  auto *data =
-      reinterpret_cast<kleidicv_thread_yuv_sp_to_rgb_u8_data *>(void_data);
+// Trampoline: `data` must point at a FunctionCallback, which gets invoked.
+static kleidicv_error_t kleidicv_thread_std_function_callback(
+    unsigned task_begin, unsigned task_end, void *data) {
+  return (*static_cast<FunctionCallback *>(data))(task_begin, task_end);
+}
+
+template <typename SrcT, typename DstT, typename F, typename... Args>
+inline kleidicv_error_t kleidicv_thread_unary_op_impl(
+    F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride,
+    DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) {
+  FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) {
+    return f(src + task_begin * src_stride / sizeof(SrcT), src_stride,
+             dst + task_begin * dst_stride / sizeof(DstT), dst_stride, width,
+             task_end - task_begin, args...);
+  };  // byte strides / sizeof(T) give the element offset of row task_begin
+  return mt.parallel(kleidicv_thread_std_function_callback, &callback,
+                     mt.parallel_data, height);  // task indices are rows
+}
 
-  size_t row_begin = size_t{task_begin} * 2;
-  size_t row_end = std::min<size_t>(data->height, size_t{task_end} * 2);
-  size_t row_uv = task_begin;
+template <typename SrcT, typename DstT, typename F, typename... Args>
+inline kleidicv_error_t kleidicv_thread_binary_op_impl(
+    F f, kleidicv_thread_multithreading mt, const SrcT *src_a,
+    size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst,
+    size_t dst_stride, size_t width, size_t height, Args... args) {
+  FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) {
+    return f(src_a + task_begin * src_a_stride / sizeof(SrcT), src_a_stride,
+             src_b + task_begin * src_b_stride / sizeof(SrcT), src_b_stride,
+             dst + task_begin * dst_stride / sizeof(DstT), dst_stride, width,
+             task_end - task_begin, args...);
+  };  // byte strides / sizeof(T) give the element offset of row task_begin
+  return mt.parallel(kleidicv_thread_std_function_callback, &callback,
+                     mt.parallel_data, height);  // task indices are rows
+}
+
+// Emits kleidicv_thread_<suffix>: kleidicv_<suffix> run row-wise through `mt`.
+#define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, SrcT, DstT)                    \
+  kleidicv_error_t kleidicv_thread_##suffix(                                 \
+      const SrcT *src, size_t src_stride, DstT *dst, size_t dst_stride,      \
+      size_t width, size_t height, kleidicv_thread_multithreading mt) {      \
+    return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src,         \
+                                         src_stride, dst, dst_stride, width, \
+                                         height);                            \
+  }
+
+KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgr_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgb_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(bgr_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(bgra_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_yuv_u8, uint8_t, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float);
+KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_f32_s8, float, int8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_f32_u8, float, uint8_t);
+KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_s8_f32, int8_t, float);
+KLEIDICV_THREAD_UNARY_OP_IMPL(float_conversion_u8_f32, uint8_t, float);
+
+kleidicv_error_t kleidicv_thread_threshold_binary_u8(
+    const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
+    size_t width, size_t height, uint8_t threshold, uint8_t value,
+    kleidicv_thread_multithreading mt) {  // rows split across mt tasks
+  return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src,
+                                       src_stride, dst, dst_stride, width,
+                                       height, threshold, value);
+}
 
-  return kleidicv_yuv_sp_to_rgb_u8(
-      data->src_y + row_begin * data->src_y_stride, data->src_y_stride,
-      data->src_uv + row_uv * data->src_uv_stride, data->src_uv_stride,
-      data->dst + row_begin * data->dst_stride, data->dst_stride, data->width,
-      row_end - row_begin, data->is_nv21);
+kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
+                                          uint8_t *dst, size_t dst_stride,
+                                          size_t width, size_t height,
+                                          float scale, float shift,
+                                          kleidicv_thread_multithreading mt) {
+  return kleidicv_thread_unary_op_impl(kleidicv_scale_u8, mt, src, src_stride,
+                                       dst, dst_stride, width, height, scale,
+                                       shift);  // rows split across mt tasks
 }
 
-kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8(
-    const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
+kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
+                                           float *dst, size_t dst_stride,
+                                           size_t width, size_t height,
+                                           float scale, float shift,
+                                           kleidicv_thread_multithreading mt) {
+  return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride,
+                                       dst, dst_stride, width, height, scale,
+                                       shift);  // rows split across mt tasks
+}
+
+#define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type)                         \
+  kleidicv_error_t kleidicv_thread_##suffix(                                 \
+      const type *src_a, size_t src_a_stride, const type *src_b,             \
+      size_t src_b_stride, type *dst, size_t dst_stride, size_t width,       \
+      size_t height, kleidicv_thread_multithreading mt) {                    \
+    return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a,      \
+                                          src_a_stride, src_b, src_b_stride, \
+                                          dst, dst_stride, width, height);   \
+  }  // kleidicv_thread_<suffix>: kleidicv_<suffix> with rows split across mt
+
+#define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype)         \
+  kleidicv_error_t kleidicv_thread_##suffix(                                  \
+      const type *src_a, size_t src_a_stride, const type *src_b,              \
+      size_t src_b_stride, type *dst, size_t dst_stride, size_t width,        \
+      size_t height, scaletype scale, kleidicv_thread_multithreading mt) {    \
+    return kleidicv_thread_binary_op_impl(                                    \
+        kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \
+        dst_stride, width, height, scale);                                    \
+  }  // as KLEIDICV_THREAD_BINARY_OP_IMPL, plus a forwarded `scale` argument
+
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t);
+KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double);
+KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double);
+KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t);
+KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t);
+
+kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16(
+    const int16_t *src_a, size_t src_a_stride, const int16_t *src_b,
+    size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width,
+    size_t height, int16_t threshold, kleidicv_thread_multithreading mt) {
+  return kleidicv_thread_binary_op_impl(  // rows split across mt tasks
+      kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride,
+      src_b, src_b_stride, dst, dst_stride, width, height, threshold);
+}
+
+template <typename F>  // f: the kleidicv_yuv_sp_to_* function run per slice
+inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl(
+    F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
     size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
     size_t height, bool is_nv21, kleidicv_thread_multithreading mt) {
-  kleidicv_thread_yuv_sp_to_rgb_u8_data callback_data = {
-      src_y,      src_y_stride, src_uv, src_uv_stride, dst,
-      dst_stride, width,        height, is_nv21};
-  return mt.parallel(kleidicv_thread_yuv_sp_to_rgb_u8_callback, &callback_data,
+  FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) {
+    size_t row_begin = size_t{task_begin} * 2;  // two src_y rows per task
+    size_t row_end = std::min<size_t>(height, size_t{task_end} * 2);
+    size_t row_uv = task_begin;  // one src_uv row per task
+    return f(src_y + row_begin * src_y_stride, src_y_stride,
+             src_uv + row_uv * src_uv_stride, src_uv_stride,
+             dst + row_begin * dst_stride, dst_stride, width,
+             row_end - row_begin, is_nv21);
+  };  // row_end is clamped so an odd height ends on a 1-row task
+  return mt.parallel(kleidicv_thread_std_function_callback, &callback,
                      mt.parallel_data, (height + 1) / 2);
 }
 
+#define YUV_SP_TO_RGB(suffix)                                               \
+  kleidicv_error_t kleidicv_thread_##suffix(                                \
+      const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,     \
+      size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,  \
+      size_t height, bool is_nv21, kleidicv_thread_multithreading mt) {     \
+    return kleidicv_thread_yuv_sp_to_rgb_u8_impl(                           \
+        kleidicv_##suffix, src_y, src_y_stride, src_uv, src_uv_stride, dst, \
+        dst_stride, width, height, is_nv21, mt);                            \
+  }  // kleidicv_thread_<suffix>: kleidicv_<suffix> split into 2-row tasks
+
+YUV_SP_TO_RGB(yuv_sp_to_bgr_u8);
+YUV_SP_TO_RGB(yuv_sp_to_bgra_u8);
+YUV_SP_TO_RGB(yuv_sp_to_rgb_u8);
+YUV_SP_TO_RGB(yuv_sp_to_rgba_u8);
+
 template <typename ScalarType, typename FunctionType>
 struct parallel_min_max_data {
   FunctionType min_max_func;
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69d5534d4019247c0ddcd8d9d345bb1a83359d18
--- /dev/null
+++ b/test/api/test_thread.cpp
@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <thread>
+
+#include "framework/array.h"
+#include "framework/generator.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv_thread/kleidicv_thread.h"
+#include "multithreading_fake.h"
+
+// Tuple of width, height, thread count.
+typedef std::tuple<unsigned, unsigned, unsigned> P;
+
+class Thread : public testing::TestWithParam<P> {
+ public:
+  template <typename SrcT, typename DstT, typename SingleThreadedFunc,
+            typename MultithreadedFunc, typename... Args>
+  void check_unary_op(SingleThreadedFunc single_threaded_func,
+                      MultithreadedFunc multithreaded_func, size_t src_channels,
+                      size_t dst_channels, Args... args) {
+    unsigned width = 0, height = 0, thread_count = 0;
+    std::tie(width, height, thread_count) = GetParam();  // unpack parameter P
+    test::Array2D<SrcT> src(size_t{width} * src_channels, height);
+    test::Array2D<DstT> dst_single(size_t{width} * dst_channels, height),
+        dst_multi(size_t{width} * dst_channels, height);
+
+    test::PseudoRandomNumberGenerator<SrcT> generator;
+    src.fill(generator);  // same input feeds both implementations
+
+    kleidicv_error_t single_result =
+        single_threaded_func(src.data(), src.stride(), dst_single.data(),
+                             dst_single.stride(), width, height, args...);
+
+    kleidicv_error_t multi_result = multithreaded_func(
+        src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width,
+        height, args..., get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_OK, single_result);
+    EXPECT_EQ(KLEIDICV_OK, multi_result);
+    EXPECT_EQ_ARRAY2D(dst_multi, dst_single);  // threaded == single-threaded
+  }
+
+  template <typename SrcT, typename DstT, typename SingleThreadedFunc,
+            typename MultithreadedFunc, typename... Args>
+  void check_binary_op(SingleThreadedFunc single_threaded_func,
+                       MultithreadedFunc multithreaded_func,
+                       size_t src_channels, size_t dst_channels, Args... args) {
+    unsigned width = 0, height = 0, thread_count = 0;
+    std::tie(width, height, thread_count) = GetParam();  // unpack parameter P
+    test::Array2D<SrcT> src_a(size_t{width} * src_channels, height),
+        src_b(size_t{width} * src_channels, height);
+    test::Array2D<DstT> dst_single(size_t{width} * dst_channels, height),
+        dst_multi(size_t{width} * dst_channels, height);
+
+    test::PseudoRandomNumberGenerator<SrcT> generator;
+    src_a.fill(generator);
+    src_b.fill(generator);  // same inputs feed both implementations
+
+    kleidicv_error_t single_result = single_threaded_func(
+        src_a.data(), src_a.stride(), src_b.data(), src_b.stride(),
+        dst_single.data(), dst_single.stride(), width, height, args...);
+
+    kleidicv_error_t multi_result = multithreaded_func(
+        src_a.data(), src_a.stride(), src_b.data(), src_b.stride(),
+        dst_multi.data(), dst_multi.stride(), width, height, args...,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_OK, single_result);
+    EXPECT_EQ(KLEIDICV_OK, multi_result);
+    EXPECT_EQ_ARRAY2D(dst_multi, dst_single);  // threaded == single-threaded
+  }
+};
+
+#define TEST_UNARY_OP(suffix, SrcT, DstT, ...)                              \
+  TEST_P(Thread, suffix) {                                                  \
+    check_unary_op<SrcT, DstT>(kleidicv_##suffix, kleidicv_thread_##suffix, \
+                               __VA_ARGS__);                                \
+  }  // trailing macro args: src channels, dst channels, extra op arguments
+
+#define TEST_BINARY_OP(suffix, T, ...)                                 \
+  TEST_P(Thread, suffix) {                                             \
+    check_binary_op<T, T>(kleidicv_##suffix, kleidicv_thread_##suffix, \
+                          __VA_ARGS__);                                \
+  }  // same layout as TEST_UNARY_OP, with one element type T for src and dst
+
+TEST_UNARY_OP(gray_to_rgb_u8, uint8_t, uint8_t, 1, 3);
+TEST_UNARY_OP(gray_to_rgba_u8, uint8_t, uint8_t, 1, 4);
+TEST_UNARY_OP(rgb_to_bgr_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(rgb_to_rgb_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(rgba_to_bgra_u8, uint8_t, uint8_t, 4, 4);
+TEST_UNARY_OP(rgba_to_rgba_u8, uint8_t, uint8_t, 4, 4);
+TEST_UNARY_OP(rgb_to_bgra_u8, uint8_t, uint8_t, 3, 4);
+TEST_UNARY_OP(rgb_to_rgba_u8, uint8_t, uint8_t, 3, 4);
+TEST_UNARY_OP(rgba_to_bgr_u8, uint8_t, uint8_t, 4, 3);
+TEST_UNARY_OP(rgba_to_rgb_u8, uint8_t, uint8_t, 4, 3);
+TEST_UNARY_OP(yuv_to_bgr_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(yuv_to_rgb_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(bgr_to_yuv_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(rgb_to_yuv_u8, uint8_t, uint8_t, 3, 3);
+TEST_UNARY_OP(bgra_to_yuv_u8, uint8_t, uint8_t, 4, 3);
+TEST_UNARY_OP(rgba_to_yuv_u8, uint8_t, uint8_t, 4, 3);
+TEST_UNARY_OP(threshold_binary_u8, uint8_t, uint8_t, 1, 1, 100, 200);
+TEST_UNARY_OP(scale_u8, uint8_t, uint8_t, 1, 1, 0.5F, 3.5F);
+TEST_UNARY_OP(scale_f32, float, float, 1, 1, 0.123F, 45.6789F);
+TEST_UNARY_OP(exp_f32, float, float, 1, 1);
+TEST_UNARY_OP(float_conversion_f32_s8, float, int8_t, 1, 1);
+TEST_UNARY_OP(float_conversion_f32_u8, float, uint8_t, 1, 1);
+TEST_UNARY_OP(float_conversion_s8_f32, int8_t, float, 1, 1);
+TEST_UNARY_OP(float_conversion_u8_f32, uint8_t, float, 1, 1);
+
+TEST_BINARY_OP(saturating_add_s8, int8_t, 1, 1);
+TEST_BINARY_OP(saturating_add_u8, uint8_t, 1, 1);
+TEST_BINARY_OP(saturating_add_s16, int16_t, 1, 1);
+TEST_BINARY_OP(saturating_add_u16, uint16_t, 1, 1);
+TEST_BINARY_OP(saturating_add_s32, int32_t, 1, 1);
+TEST_BINARY_OP(saturating_add_u32, uint32_t, 1, 1);
+TEST_BINARY_OP(saturating_add_s64, int64_t, 1, 1);
+TEST_BINARY_OP(saturating_add_u64, uint64_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_s8, int8_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_u8, uint8_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_s16, int16_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_u16, uint16_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_s32, int32_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_u32, uint32_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_s64, int64_t, 1, 1);
+TEST_BINARY_OP(saturating_sub_u64, uint64_t, 1, 1);
+TEST_BINARY_OP(saturating_absdiff_u8, uint8_t, 1, 1);
+TEST_BINARY_OP(saturating_absdiff_s8, int8_t, 1, 1);
+TEST_BINARY_OP(saturating_absdiff_u16, uint16_t, 1, 1);
+TEST_BINARY_OP(saturating_absdiff_s16, int16_t, 1, 1);
+TEST_BINARY_OP(saturating_absdiff_s32, int32_t, 1, 1);
+TEST_BINARY_OP(saturating_multiply_u8, uint8_t, 1, 1, 1.23);
+TEST_BINARY_OP(saturating_multiply_s8, int8_t, 1, 1, -2.34);
+TEST_BINARY_OP(saturating_multiply_u16, uint16_t, 1, 1, 0.321);
+TEST_BINARY_OP(saturating_multiply_s16, int16_t, 1, 1, -0.543);
+TEST_BINARY_OP(saturating_multiply_s32, int32_t, 1, 1, -0.0123);
+TEST_BINARY_OP(bitwise_and, uint8_t, 1, 1);
+TEST_BINARY_OP(compare_equal_u8, uint8_t, 1, 1);
+TEST_BINARY_OP(compare_greater_u8, uint8_t, 1, 1);
+TEST_BINARY_OP(saturating_add_abs_with_threshold_s16, int16_t, 1, 1, 123);
+
+INSTANTIATE_TEST_SUITE_P(, Thread,  // P{width, height, thread count}
+                         testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2},
+                                         P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2},
+                                         P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2},
+                                         P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}));
diff --git a/test/api/test_thread_yuv_sp_to_rgb.cpp b/test/api/test_thread_yuv_sp_to_rgb.cpp
index 1f3731df25f28cb4083341215c7f397fe513181a..a358edbd6b53797ab6419549d71b6d8c9d8bf7ed 100644
--- a/test/api/test_thread_yuv_sp_to_rgb.cpp
+++ b/test/api/test_thread_yuv_sp_to_rgb.cpp
@@ -16,35 +16,51 @@
 // Tuple of width, height, thread count.
 typedef std::tuple<unsigned, unsigned, unsigned> P;
 
-class Thread : public testing::TestWithParam<P> {};
-
-TEST_P(Thread, _) {
-  unsigned width = 0, height = 0, thread_count = 0;
-  std::tie(width, height, thread_count) = GetParam();
-  test::Array2D<uint8_t> src_y(width, height),
-      src_uv((width + 1) & ~1, (height + 1) / 2),
-      dst_single(size_t{width} * 3, height),
-      dst_multi(size_t{width} * 3, height);
-
-  test::PseudoRandomNumberGenerator<uint8_t> generator;
-  src_y.fill(generator);
-  src_uv.fill(generator);
-
-  kleidicv_error_t single_result = kleidicv_yuv_sp_to_rgb_u8(
-      src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(),
-      dst_single.data(), dst_single.stride(), width, height, false);
-
-  kleidicv_error_t multi_result = kleidicv_thread_yuv_sp_to_rgb_u8(
-      src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(),
-      dst_multi.data(), dst_multi.stride(), width, height, false,
-      get_multithreading_fake(thread_count));
-
-  EXPECT_EQ(KLEIDICV_OK, single_result);
-  EXPECT_EQ(KLEIDICV_OK, multi_result);
-  EXPECT_EQ_ARRAY2D(dst_multi, dst_single);
+class YuvSpThread : public testing::TestWithParam<P> {
+ public:
+  template <typename SingleThreadedFunc, typename MultithreadedFunc>
+  void check(SingleThreadedFunc single_threaded_func,
+             MultithreadedFunc multithreaded_func, size_t channels) {
+    unsigned width = 0, height = 0, thread_count = 0;
+    std::tie(width, height, thread_count) = GetParam();  // unpack parameter P
+    test::Array2D<uint8_t> src_y(width, height),
+        src_uv((width + 1) & ~1, (height + 1) / 2),  // even width, ceil(h/2)
+        dst_single(size_t{width} * channels, height),
+        dst_multi(size_t{width} * channels, height);
+
+    test::PseudoRandomNumberGenerator<uint8_t> generator;
+    src_y.fill(generator);
+    src_uv.fill(generator);  // same inputs feed both implementations
+
+    kleidicv_error_t single_result = single_threaded_func(
+        src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(),
+        dst_single.data(), dst_single.stride(), width, height, false);
+
+    kleidicv_error_t multi_result = multithreaded_func(
+        src_y.data(), src_y.stride(), src_uv.data(), src_uv.stride(),
+        dst_multi.data(), dst_multi.stride(), width, height, false,
+        get_multithreading_fake(thread_count));
+
+    EXPECT_EQ(KLEIDICV_OK, single_result);
+    EXPECT_EQ(KLEIDICV_OK, multi_result);
+    EXPECT_EQ_ARRAY2D(dst_multi, dst_single);  // threaded == single-threaded
+  }
+};
+
+TEST_P(YuvSpThread, ToBGR) {  // 3 output channels
+  check(kleidicv_yuv_sp_to_bgr_u8, kleidicv_thread_yuv_sp_to_bgr_u8, 3);
+}
+TEST_P(YuvSpThread, ToBGRA) {  // 4 output channels
+  check(kleidicv_yuv_sp_to_bgra_u8, kleidicv_thread_yuv_sp_to_bgra_u8, 4);
+}
+TEST_P(YuvSpThread, ToRGB) {  // 3 output channels
+  check(kleidicv_yuv_sp_to_rgb_u8, kleidicv_thread_yuv_sp_to_rgb_u8, 3);
+}
+TEST_P(YuvSpThread, ToRGBA) {  // 4 output channels
+  check(kleidicv_yuv_sp_to_rgba_u8, kleidicv_thread_yuv_sp_to_rgba_u8, 4);
 }
 
-INSTANTIATE_TEST_SUITE_P(YuvSp, Thread,
+INSTANTIATE_TEST_SUITE_P(, YuvSpThread,
                          testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2},
                                          P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2},
                                          P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2},