diff --git a/CHANGELOG.md b/CHANGELOG.md index 4aa816a5c96065108f63fcff45350aa275d35677..a6d9730cdd505c565d1d635d1d605acca8c75f59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,12 @@ KleidiCV uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html). This changelog aims to follow the guiding principles of [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). + +## 0.4.0 - not yet released + +### Added +- Sum operation supports multithreading. + ## 0.3.0 - not yet released ### Added diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index f122f79bc9dcf014d1abb1118322bdbd9b1bcb07..56efaf54b65b40b5ffb25c07f1e49e22e9f66552 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -109,6 +109,15 @@ kleidicv_error_t kleidicv_thread_yuv_sp_to_rgba_u8( size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_sum_f32 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_sum_f32(const float *src, size_t src_stride, + size_t width, size_t height, + float *sum, + kleidicv_thread_multithreading); + /// Internal - not part of the public API and its direct use is not supported. 
/// /// Multithreaded implementation of kleidicv_min_max_u8 - see the diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 0c2ff42011f0f2b935044e61b57650198b0fca21..633af2780562e092e57bb580662a39c2619cf922 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -214,6 +214,40 @@ kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( src_b, src_b_stride, dst, dst_stride, width, height, threshold); } +template +kleidicv_error_t parallel_sum(FunctionType sum_func, const ScalarType *src, + size_t src_stride, size_t width, size_t height, + ScalarType *sum, + kleidicv_thread_multithreading mt) { + std::vector sums(height, 0); + + auto callback = [&](unsigned begin, unsigned end) { + return sum_func(src + begin * (src_stride / sizeof(ScalarType)), src_stride, + width, end - begin, sums.data() + begin); + }; + + auto return_val = parallel_batches(callback, mt, height); + + if (return_val == KLEIDICV_OK) { + *sum = 0; + for (ScalarType s : sums) { + *sum += s; + } + } + + return return_val; +} + +#define DEFINE_SUM(suffix, type) \ + kleidicv_error_t kleidicv_thread_sum_##suffix( \ + const type *src, size_t src_stride, size_t width, size_t height, \ + type *sum, kleidicv_thread_multithreading mt) { \ + return parallel_sum(kleidicv_sum_##suffix, src, src_stride, width, height, \ + sum, mt); \ + } + +DEFINE_SUM(f32, float); + template inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, diff --git a/test/api/test_sum.cpp b/test/api/test_sum.cpp index e2002f4c6409b0289c3eb33766ca256c2550fd9d..3dcbb196e55d6e62d8d9d1dde187d0bb622cf583 100644 --- a/test/api/test_sum.cpp +++ b/test/api/test_sum.cpp @@ -6,6 +6,8 @@ #include +#include "framework/array.h" +#include "framework/generator.h" #include "kleidicv/kleidicv.h" TEST(Sum, Ones) { @@ -56,6 +58,30 @@ TEST(Sum, Zeroes) { EXPECT_FLOAT_EQ(0, 
sum); } +TEST(Sum, Random) { + test::PseudoRandomNumberGeneratorFloatRange random_generator{-99999, + 99999}; + test::Array2D src(32, 32, 0, 1); + src.fill(random_generator); + + float sum = std::numeric_limits::max(); + + EXPECT_EQ(KLEIDICV_OK, kleidicv_sum_f32(src.data(), src.stride(), src.width(), + src.height(), &sum)); + + double expected_sum = 0; + for (size_t row = 0; row < src.height(); ++row) { + for (size_t column = 0; column < src.width() / src.channels(); ++column) { + for (size_t ch = 0; ch < src.channels(); ++ch) { + expected_sum += *src.at(row, column * src.channels() + ch); + } + } + } + + // Some tolerance needed because KleidiCV uses single precision floats + EXPECT_NEAR(expected_sum, sum, 3); +} + TEST(Sum, Single) { // clang-format off float src[] = { diff --git a/test/api/test_thread_sum.cpp b/test/api/test_thread_sum.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dd53a053eff478295e82b1d7de149e7b6791f76d --- /dev/null +++ b/test/api/test_thread_sum.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +TEST(SumThread, CompareSingle) { + test::PseudoRandomNumberGeneratorFloatRange random_generator{-99999, + 99999}; + test::Array2D src(32, 32, 0, 1); + src.fill(random_generator); + + float sum_single = std::numeric_limits::max(); + EXPECT_EQ(KLEIDICV_OK, kleidicv_sum_f32(src.data(), src.stride(), src.width(), + src.height(), &sum_single)); + float sum_threaded = std::numeric_limits::max(); + EXPECT_EQ(KLEIDICV_OK, + kleidicv_thread_sum_f32(src.data(), src.stride(), src.width(), + src.height(), &sum_threaded, + get_multithreading_fake(3))); + EXPECT_FLOAT_EQ(sum_single, sum_threaded); +} diff --git a/test/framework/generator.h 
b/test/framework/generator.h index 4d87e97e805b0256f2bd4b2ea6a2f91aae890830..0f184be67da6aaf393153b3d75f1697b76e63017 100644 --- a/test/framework/generator.h +++ b/test/framework/generator.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "framework/abstract.h" #include "framework/utils.h" @@ -83,6 +84,23 @@ class PseudoRandomNumberGeneratorIntRange std::uniform_int_distribution dist_; }; // end of class PseudoRandomNumberGeneratorIntRange +template , bool> = true> +class PseudoRandomNumberGeneratorFloatRange + : public PseudoRandomNumberGenerator { + public: + PseudoRandomNumberGeneratorFloatRange(ElementType min, ElementType max) + : PseudoRandomNumberGenerator(), dist_(min, max) {} + + // Yields the next value or std::nullopt. + std::optional next() override { + return static_cast(dist_(this->rng_)); + } + + protected: + std::uniform_real_distribution dist_; +}; // end of class PseudoRandomNumberGeneratorFloatRange + // Generator which yields values of an iterable container. template class SequenceGenerator : public Generator {