diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e15028aa78040edcda6c5c0f34a918693b80f08..e014dfea25c8e187f679fbba75bcffd4f339a125 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ This changelog aims to follow the guiding principles of - Filter context creation API specification. - Gaussian Blur API specification. - In the OpenCV HAL, cvtColor YUV2RGB_NV21 is multithreaded. -- In the OpenCV HAL, minMaxIdx is multithreaded when index is not requested. +- In the OpenCV HAL, minMaxIdx is multithreaded. - Improved performance of Compare Equal and Greater SC API. ### Removed diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 221b791ffda5c47fec5a490c337eeeef514c73ed..df03a09f52f5c0d8cce9c9a0d8e216d5b8d27f89 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -853,7 +853,8 @@ kleidicv_error_t call_min_max_loc(FunctionType min_max_loc_func, const uchar *src_data, size_t src_stride, int width, int height, double *min_value, double *max_value, int *min_index, - int *max_index) { + int *max_index, + kleidicv_thread_multithreading mt) { size_t tmp_min_offset, tmp_max_offset; size_t *p_min_offset = (min_value || min_index) ? &tmp_min_offset : nullptr; size_t *p_max_offset = (max_value || max_index) ? 
&tmp_max_offset : nullptr; @@ -861,7 +862,7 @@ kleidicv_error_t call_min_max_loc(FunctionType min_max_loc_func, kleidicv_error_t err = min_max_loc_func(reinterpret_cast(src_data), src_stride, static_cast(width), static_cast(height), - p_min_offset, p_max_offset); + p_min_offset, p_max_offset, mt); if (min_value) { *min_value = static_cast(src_data[tmp_min_offset]); } @@ -889,8 +890,8 @@ int min_max_idx(const uchar *src_data, size_t src_step, int width, int height, if (minIdx || maxIdx) { if (depth == CV_8U) { return convert_error(call_min_max_loc( - kleidicv_min_max_loc_u8, src_data, src_step, width, height, minVal, - maxVal, minIdx, maxIdx)); + kleidicv_thread_min_max_loc_u8, src_data, src_step, width, height, + minVal, maxVal, minIdx, maxIdx, get_multithreading())); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 3071e4568891d532c12bad0c059856e4122bb3d9..5a1f0f863d92cda55912e04524f898c1a34cc648 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -55,6 +55,8 @@ kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8( size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// /// Multithreaded implementation of kleidicv_min_max_u8 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_u8(const uint8_t *src, @@ -62,6 +64,9 @@ kleidicv_error_t kleidicv_thread_min_max_u8(const uint8_t *src, size_t height, uint8_t *min_value, uint8_t *max_value, kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. 
+/// /// Multithreaded implementation of kleidicv_min_max_s8 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_s8(const int8_t *src, @@ -69,28 +74,40 @@ kleidicv_error_t kleidicv_thread_min_max_s8(const int8_t *src, size_t height, int8_t *min_value, int8_t *max_value, kleidicv_thread_multithreading); -/// Multithreaded implementation of kleidicv_thread_min_max_u16 - see the + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_min_max_u16 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_u16(const uint16_t *src, size_t src_stride, size_t width, size_t height, uint16_t *min_value, uint16_t *max_value, kleidicv_thread_multithreading); -/// Multithreaded implementation of kleidicv_thread_min_max_s16 - see the + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_min_max_s16 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_s16(const int16_t *src, size_t src_stride, size_t width, size_t height, int16_t *min_value, int16_t *max_value, kleidicv_thread_multithreading); -/// Multithreaded implementation of kleidicv_thread_min_max_s32 - see the + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_min_max_s32 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_s32(const int32_t *src, size_t src_stride, size_t width, size_t height, int32_t *min_value, int32_t *max_value, kleidicv_thread_multithreading); -/// Multithreaded implementation of kleidicv_thread_min_max_f32 - see the + +/// Internal - not part of the public API and its direct use is not supported. 
+/// +/// Multithreaded implementation of kleidicv_min_max_f32 - see the /// documentation of that function for more details. kleidicv_error_t kleidicv_thread_min_max_f32(const float *src, size_t src_stride, size_t width, @@ -98,6 +115,14 @@ kleidicv_error_t kleidicv_thread_min_max_f32(const float *src, float *max_value, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_min_max_loc_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_min_max_loc_u8( + const uint8_t *src, size_t src_stride, size_t width, size_t height, + size_t *min_offset, size_t *max_offset, kleidicv_thread_multithreading); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index 2eb7aab13ecfef2dec417989098d3ac0b754955b..96de2d333eb1bdd606027bdc223b7d13e123725f 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -132,3 +132,102 @@ DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t);
 DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t);
 DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t);
 DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float);
+
+// Arguments shared by every task of one parallel min_max_loc run.
+template <typename FunctionType, typename ScalarType>
+struct parallel_min_max_loc_data {
+  FunctionType min_max_loc_func;
+  const ScalarType *src;
+  size_t src_stride;
+  size_t width;
+  // Per-row result slots, or nullptr when that result was not requested.
+  size_t *p_min_offset;
+  size_t *p_max_offset;
+};
+
+// Processes rows [task_begin, task_end) as one image. The offsets it reports
+// are relative to row task_begin and are stored in slot task_begin.
+template <typename FunctionType, typename ScalarType>
+static kleidicv_error_t kleidicv_thread_min_max_loc_callback(
+    unsigned task_begin, unsigned task_end, void *void_data) {
+  auto *data =
+      static_cast<parallel_min_max_loc_data<FunctionType, ScalarType> *>(
+          void_data);
+
+  return data->min_max_loc_func(
+      data->src + task_begin * (data->src_stride / sizeof(ScalarType)),
+      data->src_stride, data->width, task_end - task_begin,
+      data->p_min_offset ? data->p_min_offset + task_begin : nullptr,
+      data->p_max_offset ? data->p_max_offset + task_begin : nullptr);
+}
+
+// Runs min_max_loc_func on batches of rows through mt.parallel, then reduces
+// the per-batch results to one byte offset each for the minimum and maximum.
+//
+// Only the slot of a batch's first row is written; the other slots stay 0 and
+// so refer to the first element of their row. Assuming mt.parallel hands out
+// disjoint, contiguous row ranges, that element was already covered by its
+// batch's slot, so the strict comparisons below never select it wrongly.
+template <typename FunctionType, typename ScalarType>
+kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func,
+                                      const ScalarType *src, size_t src_stride,
+                                      size_t width, size_t height,
+                                      size_t *p_min_offset,
+                                      size_t *p_max_offset,
+                                      kleidicv_thread_multithreading mt) {
+  // Scratch space is only needed for the results that were requested.
+  std::vector<size_t> min_offsets(p_min_offset ? height : 0, 0);
+  std::vector<size_t> max_offsets(p_max_offset ? height : 0, 0);
+
+  parallel_min_max_loc_data<FunctionType, ScalarType> callback_data = {
+      min_max_loc_func,
+      src,
+      src_stride,
+      width,
+      p_min_offset ? min_offsets.data() : nullptr,
+      p_max_offset ? max_offsets.data() : nullptr};
+
+  auto return_val = mt.parallel(
+      kleidicv_thread_min_max_loc_callback<FunctionType, ScalarType>,
+      &callback_data, mt.parallel_data, height);
+
+  // On failure the scratch offsets are meaningless and src may not be safe to
+  // dereference, so report the error and leave the outputs untouched.
+  if (return_val != KLEIDICV_OK) {
+    return return_val;
+  }
+
+  if (p_min_offset) {
+    *p_min_offset = 0;
+    for (size_t i = 0; i < min_offsets.size(); ++i) {
+      size_t offs = min_offsets[i] + i * src_stride;
+      if (src[offs / sizeof(ScalarType)] <
+          src[*p_min_offset / sizeof(ScalarType)]) {
+        *p_min_offset = offs;
+      }
+    }
+  }
+  if (p_max_offset) {
+    *p_max_offset = 0;
+    for (size_t i = 0; i < max_offsets.size(); ++i) {
+      size_t offs = max_offsets[i] + i * src_stride;
+      if (src[offs / sizeof(ScalarType)] >
+          src[*p_max_offset / sizeof(ScalarType)]) {
+        *p_max_offset = offs;
+      }
+    }
+  }
+  return return_val;
+}
+
+#define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type)                  \
+  kleidicv_error_t kleidicv_thread_min_max_loc_##suffix(                  \
+      const type *src, size_t src_stride, size_t width, size_t height,    \
+      size_t *p_min_offset, size_t *p_max_offset,                         \
+      kleidicv_thread_multithreading mt) {                                \
+    return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src,       \
+                                src_stride, width, height, p_min_offset,  \
+                                p_max_offset, mt);                        \
+  }
+
+DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t);
diff --git a/scripts/benchmark/build.sh b/scripts/benchmark/build.sh index 40b916c90c7d4d36ad041b83394054d70fbc892b..32453a53d6720042340fa5f83646523d6f1501fb 100755 --- a/scripts/benchmark/build.sh +++ b/scripts/benchmark/build.sh @@ -40,10 +40,14 @@ KLEIDICV_SOURCE_PATH="$(realpath 
"${SCRIPT_PATH}/..")" export COMMON_EXTRA_CMAKE_ARGS="\ -DANDROID_ABI=arm64-v8a \ -DBUILD_ANDROID_EXAMPLES=OFF \ - -DCMAKE_CXX_STANDARD=14 \ + -DBUILD_ANDROID_PROJECTS=OFF \ + -DBUILD_JAVA=OFF \ + -DWITH_QT=OFF \ + -DBUILD_OPENCV_PYTHON=NO \ + -DBUILD_OPENCV_PYTHON2=NO \ + -DBUILD_OPENCV_PYTHON3=NO \ -DBUILD_TESTS=OFF \ -DBUILD_PERF_TESTS=ON \ - -DOPENCV_DISABLE_THREAD_SUPPORT=ON \ -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON \ " diff --git a/scripts/benchmark/run_benchmarks_4K.sh b/scripts/benchmark/run_benchmarks_4K.sh index 50cfd47e8fd04dc87083bf336a394f35d7020c33..8d959ba9ea14216fc96a162a7b0e2c87ada51219 100755 --- a/scripts/benchmark/run_benchmarks_4K.sh +++ b/scripts/benchmark/run_benchmarks_4K.sh @@ -78,6 +78,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_S32_4K opencv_perf_core '*minMaxVals/*' '(3840x2160, 32SC1)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_F32_4K opencv_perf_core '*minMaxVals/*' '(3840x2160, 32FC1)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMaxLoc_U8_4K opencv_perf_core '*minMaxLoc/*' '(3840x2160, 8UC1)')") + RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToInt opencv_perf_core '*convertTo/*' '(3840x2160, 32FC1, 8SC1, 1, 1, 0)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToUint opencv_perf_core '*convertTo/*' '(3840x2160, 32FC1, 8UC1, 1, 1, 0)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL IntToFloat opencv_perf_core '*convertTo/*' '(3840x2160, 8SC1, 32FC1, 1, 1, 0)')") diff --git a/scripts/benchmark/run_benchmarks_FHD.sh b/scripts/benchmark/run_benchmarks_FHD.sh index 234e2767d205e9729919d4d598afef847b5c6f81..fe4855980c7e28a1fbb5d856c13014d57c783dae 100755 --- a/scripts/benchmark/run_benchmarks_FHD.sh +++ 
b/scripts/benchmark/run_benchmarks_FHD.sh @@ -78,6 +78,8 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_S32_FHD opencv_perf_core '*minMaxVals/*' '(1920x1080, 32SC1)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_F32_FHD opencv_perf_core '*minMaxVals/*' '(1920x1080, 32FC1)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMaxLoc_U8_FHD opencv_perf_core '*minMaxLoc/*' '(1920x1080, 8UC1)')") + RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToInt opencv_perf_core '*convertTo/*' '(1920x1080, 32FC1, 8SC1, 1, 1, 0)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToUint opencv_perf_core '*convertTo/*' '(1920x1080, 32FC1, 8UC1, 1, 1, 0)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL IntToFloat opencv_perf_core '*convertTo/*' '(1920x1080, 8SC1, 32FC1, 1, 1, 0)')") diff --git a/test/api/test_thread_min_max.cpp b/test/api/test_thread_min_max.cpp index 088f716bfba9ac82d1f4849802ad8ce7db5c6ce3..83bcb6aa281b2bd47e72cf5d07de6b9bdc790f8e 100644 --- a/test/api/test_thread_min_max.cpp +++ b/test/api/test_thread_min_max.cpp @@ -34,9 +34,9 @@ KLEIDICV_THREAD_MIN_MAX(int32_t, s32); KLEIDICV_THREAD_MIN_MAX(float, f32); template -class Thread : public testing::Test {}; +class MinMaxThread : public testing::Test {}; -TYPED_TEST_SUITE_P(Thread); +TYPED_TEST_SUITE_P(MinMaxThread); // Tuple of width, height, thread count. 
typedef std::tuple P; @@ -45,7 +45,7 @@ static const auto test_params = { P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}}; -TYPED_TEST_P(Thread, CompareWithSingle) { +TYPED_TEST_P(MinMaxThread, CompareWithSingle) { size_t width = 0, height = 0, thread_count = 0; for (auto params : test_params) { std::tie(width, height, thread_count) = params; @@ -69,7 +69,7 @@ TYPED_TEST_P(Thread, CompareWithSingle) { } } -TYPED_TEST_P(Thread, NullArguments) { +TYPED_TEST_P(MinMaxThread, NullArguments) { size_t width = 1, height = 2, thread_count = 2; TypeParam src[2] = {1, 2}, min_value, max_value; @@ -101,9 +101,9 @@ TYPED_TEST_P(Thread, NullArguments) { EXPECT_EQ(0, max_value); } -REGISTER_TYPED_TEST_SUITE_P(Thread, CompareWithSingle, NullArguments); +REGISTER_TYPED_TEST_SUITE_P(MinMaxThread, CompareWithSingle, NullArguments); using MinMaxElementTypes = ::testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(MinMax, Thread, MinMaxElementTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(MinMax, MinMaxThread, MinMaxElementTypes); diff --git a/test/api/test_thread_min_max_loc.cpp b/test/api/test_thread_min_max_loc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34979cb2a93478fbf1b156e56836d889584a9ded --- /dev/null +++ b/test/api/test_thread_min_max_loc.cpp @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +#define KLEIDICV_MIN_MAX_LOC(type, suffix) \ + KLEIDICV_API(min_max_loc, kleidicv_min_max_loc_##suffix, type) + +KLEIDICV_MIN_MAX_LOC(uint8_t, u8); + +#define KLEIDICV_THREAD_MIN_MAX_LOC(type, suffix) \ + KLEIDICV_API(thread_min_max_loc, kleidicv_thread_min_max_loc_##suffix, type) + 
+KLEIDICV_THREAD_MIN_MAX_LOC(uint8_t, u8); + +template +class MinMaxLocThread : public testing::Test {}; + +TYPED_TEST_SUITE_P(MinMaxLocThread); + +// Tuple of width, height, thread count. +typedef std::tuple P; + +static const auto test_params = { + P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5}}; + +TYPED_TEST_P(MinMaxLocThread, CompareWithSingle) { + size_t width = 0, height = 0, thread_count = 0; + for (auto params : test_params) { + std::tie(width, height, thread_count) = params; + test::Array2D src(width, height); + size_t min_offset_single = 0, max_offset_single = 0, min_offset_multi = 0, + max_offset_multi = 0; + + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + + kleidicv_error_t single_result = + min_max_loc()(src.data(), src.stride(), width, height, + &min_offset_single, &max_offset_single); + + kleidicv_error_t multi_result = thread_min_max_loc()( + src.data(), src.stride(), width, height, &min_offset_multi, + &max_offset_multi, get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ(min_offset_multi, min_offset_single); + EXPECT_EQ(max_offset_multi, max_offset_single); + } +} + +TYPED_TEST_P(MinMaxLocThread, NullArguments) { + size_t width = 1, height = 2, thread_count = 2; + TypeParam src[2] = {1, 2}; + // Let it be different from 0, 1 or 2 + const size_t kUnchanged = 99; + size_t min_offset = kUnchanged, max_offset = kUnchanged; + + kleidicv_error_t res = thread_min_max_loc()( + src, width * sizeof(TypeParam), width, height, nullptr, &max_offset, + get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, res); + EXPECT_EQ(kUnchanged, min_offset); + EXPECT_EQ(1, max_offset); + + min_offset = max_offset = kUnchanged; + res = thread_min_max_loc()(src, width * sizeof(TypeParam), width, + height, &min_offset, nullptr, + 
get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, res); + EXPECT_EQ(0, min_offset); + EXPECT_EQ(kUnchanged, max_offset); + + min_offset = max_offset = kUnchanged; + res = thread_min_max_loc()(src, width * sizeof(TypeParam), width, + height, nullptr, nullptr, + get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, res); + EXPECT_EQ(kUnchanged, min_offset); + EXPECT_EQ(kUnchanged, max_offset); +} + +REGISTER_TYPED_TEST_SUITE_P(MinMaxLocThread, CompareWithSingle, NullArguments); + +using MinMaxLocElementTypes = ::testing::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(MinMaxLoc, MinMaxLocThread, + MinMaxLocElementTypes);