From 24f7071262e864be32ce7802dd0714f829584277 Mon Sep 17 00:00:00 2001
From: Luna Lamb <luna.lamb@arm.com>
Date: Tue, 1 Apr 2025 13:35:03 +0000
Subject: [PATCH] Special multithreaded implementation for kleidicv_scale_u8

---
 adapters/opencv/kleidicv_hal.cpp              |   2 +-
 kleidicv/include/kleidicv/arithmetics/scale.h |  21 +++
 kleidicv/src/arithmetics/scale_neon.cpp       | 173 ++++++++++++------
 kleidicv_thread/src/kleidicv_thread.cpp       |  14 +-
 test/api/test_thread.cpp                      |  45 +++++
 5 files changed, 192 insertions(+), 63 deletions(-)
 create mode 100644 kleidicv/include/kleidicv/arithmetics/scale.h
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 8cbdeb19f..7a554af6c 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -40,7 +40,7 @@ enum {
   MULTITHREAD_MIN_ELEMENTS_RESIZE_TO_QUARTER_U8 = 150000,
   MULTITHREAD_MIN_ELEMENTS_RGB_TO_BGR_U8 = 180000,
   MULTITHREAD_MIN_ELEMENTS_RGBA_TO_BGRA_U8 = 11000,
-  MULTITHREAD_MIN_ELEMENTS_SCALE_U8 = 5000,
+  MULTITHREAD_MIN_ELEMENTS_SCALE_U8 = 13000,
   MULTITHREAD_MIN_ELEMENTS_SCALE_F32 = 20000,
   MULTITHREAD_MIN_ELEMENTS_ROTATE_U8 = 40000,
   MULTITHREAD_MIN_ELEMENTS_ROTATE_U16 = 30000,
diff --git a/kleidicv/include/kleidicv/arithmetics/scale.h b/kleidicv/include/kleidicv/arithmetics/scale.h
new file mode 100644
index 000000000..b41262bd4
--- /dev/null
+++ b/kleidicv/include/kleidicv/arithmetics/scale.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <array>
+
+#include "kleidicv/ctypes.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/traits.h"
+#include "kleidicv/types.h"
+
+namespace kleidicv::neon {
+
+std::array<uint8_t, 256> precalculate_scale_table_u8(float scale, float shift);
+
+kleidicv_error_t scale_with_precalculated_table(
+    const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
+    size_t width, size_t height, float scale, float shift,
+    const std::array<uint8_t, 256> &precalculated_table);
+
+}  // namespace kleidicv::neon
diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp
index 60621cca9..3680259ce 100644
--- a/kleidicv/src/arithmetics/scale_neon.cpp
+++ b/kleidicv/src/arithmetics/scale_neon.cpp
@@ -4,9 +4,10 @@
 
 #include <climits>
 #include <cmath>
+#include <cstdint>
 #include <limits>
 
-#include "kleidicv/kleidicv.h"
+#include "kleidicv/arithmetics/scale.h"
 #include "kleidicv/neon.h"
 #include "kleidicv/traits.h"
 
@@ -61,18 +62,25 @@ class ScaleIntBase : public UnrollTwice {
  protected:
   static constexpr ScalarType ScalarMax =
       std::numeric_limits<ScalarType>::max();
-  inline ScalarType scale_value(ScalarType value) {
-    int64_t v = lrintf(value * scale_ + shift_);
-    if (static_cast<uint64_t>(v) <= ScalarMax) {
-      return static_cast<ScalarType>(v);
-    }
-    return static_cast<ScalarType>(v > 0 ? ScalarMax : 0);
-  }
 
- private:
   float scale_, shift_;
 };
 
+template <typename T>
+kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift);
+
+template <typename T>
+T scale_value(T value, float scale, float shift) {
+  static constexpr T ScalarMax = std::numeric_limits<T>::max();
+  int64_t v = lrintf(static_cast<float>(value) * scale + shift);
+  if (static_cast<uint64_t>(v) <= ScalarMax) {
+    return static_cast<T>(v);
+  }
+  return static_cast<T>(v > 0 ? ScalarMax : 0);
+}
+
 class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
  public:
   using ScalarType = uint8_t;
@@ -81,24 +89,22 @@ class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
   using Vector2Type = typename VecTraits::Vector2Type;
   using Vector3Type = typename VecTraits::Vector3Type;
 
-  ScaleUint8Tbx(float scale, float shift)
-      : ScaleIntBase<uint8_t>(scale, shift) {
-    constexpr size_t TableLength = 1 << (CHAR_BIT * sizeof(ScalarType));
-    ScalarType values[TableLength];
-    for (size_t i = 0; i < TableLength; ++i) {
-      values[i] = this->scale_value(i);
-    }
-
-    VecTraits::load(values, t0_3_);
-    VecTraits::load(values + 3 * VecTraits::num_lanes(), t1_3_);
-    VecTraits::load(values + (3 + 3) * VecTraits::num_lanes(), t2_2_);
-    VecTraits::load(values + (3 + 3 + 2) * VecTraits::num_lanes(), t3_3_);
-    VecTraits::load(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
-    VecTraits::load(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
-                    t5_3_);
-
-    v_step3_ = vdupq_n_u8(3 * VecTraits::num_lanes());
-    v_step2_ = vdupq_n_u8(2 * VecTraits::num_lanes());
+  ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table)
+      : ScaleIntBase<uint8_t>(scale, shift),
+        table_pointer_(precalculated_table),
+        v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())),
+        v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) {
+    VecTraits::load(precalculated_table, t0_3_);
+    VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_);
+    VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(),
+                    t2_2_);
+    VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(),
+                    t3_3_);
+    VecTraits::load(
+        precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
+    VecTraits::load(
+        precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
+        t5_3_);
   }
   VectorType vector_path(VectorType src) {
     VectorType dst = vqtbl3q_u8(t0_3_, src);
@@ -115,9 +121,10 @@ class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
     return dst;
   }
 
-  ScalarType scalar_path(ScalarType src) { return this->scale_value(src); }
+  ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; }
 
  private:
+  const ScalarType *table_pointer_;
   Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
   Vector2Type t2_2_{}, t4_2_{};
   VectorType v_step3_, v_step2_;
@@ -153,7 +160,9 @@ class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
     return vqmovn_high_u16(vqmovn_u16(res1), res2);
   }
 
-  ScalarType scalar_path(ScalarType src) { return this->scale_value(src); }
+  ScalarType scalar_path(ScalarType src) {
+    return scale_value(src, scale_, shift_);
+  }
 
  private:
   static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max();
@@ -174,6 +183,82 @@ class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
   float32x4_t vscale_, vshift_;
 };  // end of class ScaleUint8Calc<T>
 
+kleidicv_error_t scale_with_precalculated_table(
+    const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
+    size_t width, size_t height, float scale, float shift,
+    const std::array<uint8_t, 256> &precalculated_table) {
+  Rectangle rect{width, height};
+  Rows<const uint8_t> src_rows{src, src_stride};
+  Rows<uint8_t> dst_rows{dst, dst_stride};
+  ScaleUint8Tbx operation(scale, shift, precalculated_table.data());
+  apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+
+  return KLEIDICV_OK;
+}
+
+// Specialization for uint8_t
+template <>
+kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride, height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
+  CHECK_IMAGE_SIZE(width, height);
+  // For smaller inputs, the full calculation is the faster
+  if (width * height < 675) {  // empirical value
+    Rectangle rect{width, height};
+    Rows<const uint8_t> src_rows{src, src_stride};
+    Rows<uint8_t> dst_rows{dst, dst_stride};
+    ScaleUint8Calc operation(scale, shift);
+    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  } else {
+    // For bigger inputs, it's faster to pre-calculate the table
+    // and map those values during the run
+    auto precalculated_table = precalculate_scale_table_u8(scale, shift);
+    return scale_with_precalculated_table(src, src_stride, dst, dst_stride,
+                                          width, height, scale, shift,
+                                          precalculated_table);
+  }
+  return KLEIDICV_OK;
+}
+
+static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) {
+  float32x4_t fx = vcvtq_f32_u32(src);
+  float32x4_t max = vdupq_n_f32(255.0F);
+  float32x4_t min = vdupq_n_f32(0.0F);
+  float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale));
+  return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max)));
+}
+
+std::array<uint8_t, 256> precalculate_scale_table_u8(float scale, float shift) {
+  static constexpr size_t TableLength = 256;
+  std::array<uint8_t, TableLength> precalculated_table{};
+
+  uint32x4_t counter = {0, 1, 2, 3};
+  uint32x4_t four = vdupq_n_u32(4);
+
+  for (size_t i = 0; i < TableLength; i += 16) {
+    uint32x4_t res11 = scale_shift(counter, scale, shift);
+    counter = vaddq(counter, four);
+    uint32x4_t res12 = scale_shift(counter, scale, shift);
+    counter = vaddq(counter, four);
+    uint32x4_t res21 = scale_shift(counter, scale, shift);
+    counter = vaddq(counter, four);
+    uint32x4_t res22 = scale_shift(counter, scale, shift);
+    counter = vaddq(counter, four);
+
+    uint16x8_t res1 =
+        vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
+    uint16x8_t res2 =
+        vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
+    // Saturating narrowing from 16 to 8 bits
+    uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2);
+
+    vst1q_u8(&precalculated_table[i], res);
+  }
+  return precalculated_table;
+}
+
 // -----------------------------------------------------------------------
 // Float implementation
 // -----------------------------------------------------------------------
@@ -226,36 +311,6 @@ class ScaleFloat final : public UnrollTwice,
   float32x4_t vscale_, vshift_;
 };  // end of class ScaleFloat
 
-template <typename T>
-kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
-                       size_t dst_stride, size_t width, size_t height,
-                       float scale, float shift);
-
-// Specialization for uint8_t
-template <>
-kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
-                       size_t dst_stride, size_t width, size_t height,
-                       float scale, float shift) {
-  CHECK_POINTER_AND_STRIDE(src, src_stride, height);
-  CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
-  CHECK_IMAGE_SIZE(width, height);
-
-  Rectangle rect{width, height};
-  Rows<const uint8_t> src_rows{src, src_stride};
-  Rows<uint8_t> dst_rows{dst, dst_stride};
-  // For smaller inputs, the full calculation is the faster
-  if (width * height < 2500) {  // empirical value
-    ScaleUint8Calc operation(scale, shift);
-    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
-  } else {
-    // For bigger inputs, it's faster to pre-calculate the table
-    // and map those values during the run
-    ScaleUint8Tbx operation(scale, shift);
-    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
-  }
-  return KLEIDICV_OK;
-}
-
 // Specialization for float
 template <>
 kleidicv_error_t scale(const float *src, size_t src_stride, float *dst,
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index f72010f92..e71b2a339 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -6,11 +6,13 @@
 
 #include <algorithm>
 #include <cstddef>
+#include <cstdint>
 #include <functional>
 #include <limits>
 #include <vector>
 
 #include "kleidicv/arithmetics/rotate.h"
+#include "kleidicv/arithmetics/scale.h"
 #include "kleidicv/ctypes.h"
 #include "kleidicv/filters/blur_and_downsample.h"
 #include "kleidicv/filters/gaussian_blur.h"
@@ -147,9 +149,15 @@ kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
                                           size_t width, size_t height,
                                           float scale, float shift,
                                           kleidicv_thread_multithreading mt) {
-  return kleidicv_thread_unary_op_impl(kleidicv_scale_u8, mt, src, src_stride,
-                                       dst, dst_stride, width, height, scale,
-                                       shift);
+  CHECK_POINTER_AND_STRIDE(src, src_stride, height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
+  CHECK_IMAGE_SIZE(width, height);
+
+  const std::array<uint8_t, 256> precalculated_table =
+      kleidicv::neon::precalculate_scale_table_u8(scale, shift);
+  return kleidicv_thread_unary_op_impl(
+      kleidicv::neon::scale_with_precalculated_table, mt, src, src_stride, dst,
+      dst_stride, width, height, scale, shift, precalculated_table);
 }
 
 kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 30fa1405f..876768095 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -858,6 +858,51 @@ INSTANTIATE_TEST_SUITE_P(
                     P{2, 48, 1}, P{6, 64, 1}, P{4, 80, 2}, P{2, 96, 3},
                     P{1, 112, 4}, P{12, 34, 5}));
 
+TEST(ThreadedScaleU8, NotImplemented) {
+  test::Array2D<uint8_t> src(size_t{1}, 1), dst(size_t{1}, 1);
+  test::test_null_args(kleidicv_thread_scale_u8, src.data(), src.stride(),
+                       dst.data(), dst.stride(), 1, 1, 2, 0,
+                       get_multithreading_fake(2));
+}
+
+TEST(ThreadedScaleU8, OversizeImage) {
+  test::Array2D<uint8_t> src(size_t{1}, 1), dst(size_t{1}, 1);
+  kleidicv_error_t result = kleidicv_thread_scale_u8(
+      src.data(), src.stride(), dst.data(), dst.stride(),
+      KLEIDICV_MAX_IMAGE_PIXELS + 1, 2, 2, 0, get_multithreading_fake(2));
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE, result);
+}
+
+TEST(ThreadedScaleU8, ZerosizeImage) {
+  test::Array2D<uint8_t> src(size_t{1}, 1), dst(size_t{1}, 1);
+  kleidicv_error_t result = kleidicv_thread_scale_u8(
+      src.data(), src.stride(), dst.data(), dst.stride(), 0, 2, 2, 0,
+      get_multithreading_fake(2));
+  EXPECT_EQ(KLEIDICV_OK, result);
+}
+
+TEST(ThreadedScaleU8, Consistency) {
+  const auto width = 55;
+  const auto height = 60;
+  const uint8_t src_val = 230, scale = 2, shift = 3;
+  test::Array2D<uint8_t> src(size_t{width}, height),
+      dst_single(size_t{width}, height), dst_multi(size_t{width}, height);
+
+  src.fill(src_val);
+
+  kleidicv_error_t single_result =
+      kleidicv_scale_u8(src.data(), src.stride(), dst_single.data(),
+                        dst_single.stride(), width, height, scale, shift);
+
+  kleidicv_error_t multi_result = kleidicv_thread_scale_u8(
+      src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width,
+      height, scale, shift, get_multithreading_fake(2));
+
+  EXPECT_EQ(KLEIDICV_OK, multi_result);
+  EXPECT_EQ(KLEIDICV_OK, single_result);
+  EXPECT_EQ_ARRAY2D(dst_multi, dst_single);
+}
+
 // Operations in the Neon backend have both a vector path and a scalar path.
 // The vector path is used to process most data and the scalar path is used to
 // process the parts of the data that don't fit into the vector width.
-- 
GitLab