From 24f7071262e864be32ce7802dd0714f829584277 Mon Sep 17 00:00:00 2001 From: Luna Lamb Date: Tue, 1 Apr 2025 13:35:03 +0000 Subject: [PATCH] Special multithreaded implementation for kleidicv_scale_u8 --- adapters/opencv/kleidicv_hal.cpp | 2 +- kleidicv/include/kleidicv/arithmetics/scale.h | 21 +++ kleidicv/src/arithmetics/scale_neon.cpp | 173 ++++++++++++------ kleidicv_thread/src/kleidicv_thread.cpp | 14 +- test/api/test_thread.cpp | 45 +++++ 5 files changed, 192 insertions(+), 63 deletions(-) create mode 100644 kleidicv/include/kleidicv/arithmetics/scale.h diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 8cbdeb19f..7a554af6c 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -40,7 +40,7 @@ enum { MULTITHREAD_MIN_ELEMENTS_RESIZE_TO_QUARTER_U8 = 150000, MULTITHREAD_MIN_ELEMENTS_RGB_TO_BGR_U8 = 180000, MULTITHREAD_MIN_ELEMENTS_RGBA_TO_BGRA_U8 = 11000, - MULTITHREAD_MIN_ELEMENTS_SCALE_U8 = 5000, + MULTITHREAD_MIN_ELEMENTS_SCALE_U8 = 13000, MULTITHREAD_MIN_ELEMENTS_SCALE_F32 = 20000, MULTITHREAD_MIN_ELEMENTS_ROTATE_U8 = 40000, MULTITHREAD_MIN_ELEMENTS_ROTATE_U16 = 30000, diff --git a/kleidicv/include/kleidicv/arithmetics/scale.h b/kleidicv/include/kleidicv/arithmetics/scale.h new file mode 100644 index 000000000..b41262bd4 --- /dev/null +++ b/kleidicv/include/kleidicv/arithmetics/scale.h @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "kleidicv/ctypes.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/traits.h" +#include "kleidicv/types.h" + +namespace kleidicv::neon { + +std::array precalculate_scale_table_u8(float scale, float shift); + +kleidicv_error_t scale_with_precalculated_table( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, float scale, float shift, + const std::array &precalculated_table); + +} // namespace kleidicv::neon diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp index 60621cca9..3680259ce 100644 --- a/kleidicv/src/arithmetics/scale_neon.cpp +++ b/kleidicv/src/arithmetics/scale_neon.cpp @@ -4,9 +4,10 @@ #include #include +#include #include -#include "kleidicv/kleidicv.h" +#include "kleidicv/arithmetics/scale.h" #include "kleidicv/neon.h" #include "kleidicv/traits.h" @@ -61,18 +62,25 @@ class ScaleIntBase : public UnrollTwice { protected: static constexpr ScalarType ScalarMax = std::numeric_limits::max(); - inline ScalarType scale_value(ScalarType value) { - int64_t v = lrintf(value * scale_ + shift_); - if (static_cast(v) <= ScalarMax) { - return static_cast(v); - } - return static_cast(v > 0 ? ScalarMax : 0); - } - private: float scale_, shift_; }; +template +kleidicv_error_t scale(const T *src, size_t src_stride, T *dst, + size_t dst_stride, size_t width, size_t height, + float scale, float shift); + +template +T scale_value(T value, float scale, float shift) { + static constexpr T ScalarMax = std::numeric_limits::max(); + int64_t v = lrintf(static_cast(value) * scale + shift); + if (static_cast(v) <= ScalarMax) { + return static_cast(v); + } + return static_cast(v > 0 ? ScalarMax : 0); +} + class ScaleUint8Tbx final : public ScaleIntBase { public: using ScalarType = uint8_t; @@ -81,24 +89,22 @@ class ScaleUint8Tbx final : public ScaleIntBase { using Vector2Type = typename VecTraits::Vector2Type; using Vector3Type = typename VecTraits::Vector3Type; - ScaleUint8Tbx(float scale, float shift) - : ScaleIntBase(scale, shift) { - constexpr size_t TableLength = 1 << (CHAR_BIT * sizeof(ScalarType)); - ScalarType values[TableLength]; - for (size_t i = 0; i < TableLength; ++i) { - values[i] = this->scale_value(i); - } - - VecTraits::load(values, t0_3_); - VecTraits::load(values + 3 * VecTraits::num_lanes(), t1_3_); - VecTraits::load(values + (3 + 3) * VecTraits::num_lanes(), t2_2_); - VecTraits::load(values + (3 + 3 + 2) * VecTraits::num_lanes(), t3_3_); - VecTraits::load(values + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_); - VecTraits::load(values + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(), - t5_3_); - - v_step3_ = vdupq_n_u8(3 * VecTraits::num_lanes()); - v_step2_ = vdupq_n_u8(2 * VecTraits::num_lanes()); + ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table) + : ScaleIntBase(scale, shift), + table_pointer_(precalculated_table), + v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())), + v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) { + VecTraits::load(precalculated_table, t0_3_); + VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_); + VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(), + t2_2_); + VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(), + t3_3_); + VecTraits::load( + precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_); + VecTraits::load( + precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(), + t5_3_); } VectorType vector_path(VectorType src) { VectorType dst = vqtbl3q_u8(t0_3_, src); @@ -115,9 +121,10 @@ class ScaleUint8Tbx final : public ScaleIntBase { return dst; } - ScalarType scalar_path(ScalarType src) { return this->scale_value(src); } + ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; } private: + const ScalarType *table_pointer_; Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{}; Vector2Type t2_2_{}, t4_2_{}; VectorType v_step3_, v_step2_; @@ -153,7 +160,9 @@ class ScaleUint8Calc final : public ScaleIntBase { return vqmovn_high_u16(vqmovn_u16(res1), res2); } - ScalarType scalar_path(ScalarType src) { return this->scale_value(src); } + ScalarType scalar_path(ScalarType src) { + return scale_value(src, scale_, shift_); + } private: static constexpr ScalarType FF = std::numeric_limits::max(); @@ -174,6 +183,82 @@ class ScaleUint8Calc final : public ScaleIntBase { float32x4_t vscale_, vshift_; }; // end of class ScaleUint8Calc +kleidicv_error_t scale_with_precalculated_table( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, float scale, float shift, + const std::array &precalculated_table) { + Rectangle rect{width, height}; + Rows src_rows{src, src_stride}; + Rows dst_rows{dst, dst_stride}; + ScaleUint8Tbx operation(scale, shift, precalculated_table.data()); + apply_operation_by_rows(operation, rect, src_rows, dst_rows); + + return KLEIDICV_OK; +} + +// Specialization for uint8_t +template <> +kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst, + size_t dst_stride, size_t width, size_t height, + float scale, float shift) { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + // For smaller inputs, the full calculation is the faster + if (width * height < 675) { // empirical value + Rectangle rect{width, height}; + Rows src_rows{src, src_stride}; + Rows dst_rows{dst, dst_stride}; + ScaleUint8Calc operation(scale, shift); + apply_operation_by_rows(operation, rect, src_rows, dst_rows); + } else { + // For bigger inputs, it's faster to pre-calculate the table + // and map those values during the run + auto precalculated_table = precalculate_scale_table_u8(scale, shift); + return scale_with_precalculated_table(src, src_stride, dst, dst_stride, + width, height, scale, shift, + precalculated_table); + } + return KLEIDICV_OK; +} + +static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) { + float32x4_t fx = vcvtq_f32_u32(src); + float32x4_t max = vdupq_n_f32(255.0F); + float32x4_t min = vdupq_n_f32(0.0F); + float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale)); + return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max))); +} + +std::array precalculate_scale_table_u8(float scale, float shift) { + static constexpr size_t TableLength = 256; + std::array precalculated_table{}; + + uint32x4_t counter = {0, 1, 2, 3}; + uint32x4_t four = vdupq_n_u32(4); + + for (size_t i = 0; i < TableLength; i += 16) { + uint32x4_t res11 = scale_shift(counter, scale, shift); + counter = vaddq(counter, four); + uint32x4_t res12 = scale_shift(counter, scale, shift); + counter = vaddq(counter, four); + uint32x4_t res21 = scale_shift(counter, scale, shift); + counter = vaddq(counter, four); + uint32x4_t res22 = scale_shift(counter, scale, shift); + counter = vaddq(counter, four); + + uint16x8_t res1 = + vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12)); + uint16x8_t res2 = + vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22)); + // Saturating narrowing from 16 to 8 bits + uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2); + + vst1q_u8(&precalculated_table[i], res); + } + return precalculated_table; +} + // ----------------------------------------------------------------------- // Float implementation // ----------------------------------------------------------------------- @@ -226,36 +311,6 @@ class ScaleFloat final : public UnrollTwice, float32x4_t vscale_, vshift_; }; // end of class ScaleFloat -template -kleidicv_error_t scale(const T *src, size_t src_stride, T *dst, - size_t dst_stride, size_t width, size_t height, - float scale, float shift); - -// Specialization for uint8_t -template <> -kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst, - size_t dst_stride, size_t width, size_t height, - float scale, float shift) { - CHECK_POINTER_AND_STRIDE(src, src_stride, height); - CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); - CHECK_IMAGE_SIZE(width, height); - - Rectangle rect{width, height}; - Rows src_rows{src, src_stride}; - Rows dst_rows{dst, dst_stride}; - // For smaller inputs, the full calculation is the faster - if (width * height < 2500) { // empirical value - ScaleUint8Calc operation(scale, shift); - apply_operation_by_rows(operation, rect, src_rows, dst_rows); - } else { - // For bigger inputs, it's faster to pre-calculate the table - // and map those values during the run - ScaleUint8Tbx operation(scale, shift); - apply_operation_by_rows(operation, rect, src_rows, dst_rows); - } - return KLEIDICV_OK; -} - // Specialization for float template <> kleidicv_error_t scale(const float *src, size_t src_stride, float *dst, diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index f72010f92..e71b2a339 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -6,11 +6,13 @@ #include #include +#include #include #include #include #include "kleidicv/arithmetics/rotate.h" +#include "kleidicv/arithmetics/scale.h" #include "kleidicv/ctypes.h" #include "kleidicv/filters/blur_and_downsample.h" #include "kleidicv/filters/gaussian_blur.h" @@ -147,9 +149,15 @@ kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, size_t width, size_t height, float scale, float shift, kleidicv_thread_multithreading mt) { - return kleidicv_thread_unary_op_impl(kleidicv_scale_u8, mt, src, src_stride, - dst, dst_stride, width, height, scale, - shift); + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + const std::array precalculated_table = + kleidicv::neon::precalculate_scale_table_u8(scale, shift); + return kleidicv_thread_unary_op_impl( + kleidicv::neon::scale_with_precalculated_table, mt, src, src_stride, dst, + dst_stride, width, height, scale, shift, precalculated_table); } kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp index 30fa1405f..876768095 100644 --- a/test/api/test_thread.cpp +++ b/test/api/test_thread.cpp @@ -858,6 +858,51 @@ INSTANTIATE_TEST_SUITE_P( P{2, 48, 1}, P{6, 64, 1}, P{4, 80, 2}, P{2, 96, 3}, P{1, 112, 4}, P{12, 34, 5})); +TEST(ThreadedScaleU8, NotImplemented) { + test::Array2D src(size_t{1}, 1), dst(size_t{1}, 1); + test::test_null_args(kleidicv_thread_scale_u8, src.data(), src.stride(), + dst.data(), dst.stride(), 1, 1, 2, 0, + get_multithreading_fake(2)); +} + +TEST(ThreadedScaleU8, OversizeImage) { + test::Array2D src(size_t{1}, 1), dst(size_t{1}, 1); + kleidicv_error_t result = kleidicv_thread_scale_u8( + src.data(), src.stride(), dst.data(), dst.stride(), + KLEIDICV_MAX_IMAGE_PIXELS + 1, 2, 2, 0, get_multithreading_fake(2)); + EXPECT_EQ(KLEIDICV_ERROR_RANGE, result); +} + +TEST(ThreadedScaleU8, ZerosizeImage) { + test::Array2D src(size_t{1}, 1), dst(size_t{1}, 1); + kleidicv_error_t result = kleidicv_thread_scale_u8( + src.data(), src.stride(), dst.data(), dst.stride(), 0, 2, 2, 0, + get_multithreading_fake(2)); + EXPECT_EQ(KLEIDICV_OK, result); +} + +TEST(ThreadedScaleU8, Consistency) { + const auto width = 55; + const auto height = 60; + const uint8_t src_val = 230, scale = 2, shift = 3; + test::Array2D src(size_t{width}, height), + dst_single(size_t{width}, height), dst_multi(size_t{width}, height); + + src.fill(src_val); + + kleidicv_error_t single_result = + kleidicv_scale_u8(src.data(), src.stride(), dst_single.data(), + dst_single.stride(), width, height, scale, shift); + + kleidicv_error_t multi_result = kleidicv_thread_scale_u8( + src.data(), src.stride(), dst_multi.data(), dst_multi.stride(), width, + height, scale, shift, get_multithreading_fake(2)); + + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ_ARRAY2D(dst_multi, dst_single); +} + // Operations in the Neon backend have both a vector path and a scalar path. // The vector path is used to process most data and the scalar path is used to // process the parts of the data that don't fit into the vector width. -- GitLab