From 80c74d0bc2cf7a0f6372c6c7ebf990864e30721f Mon Sep 17 00:00:00 2001 From: Michael Platings Date: Tue, 18 Jun 2024 15:49:29 +0100 Subject: [PATCH] Speed up benchmarks Much of the benchmark time was spent initializing buffers. This change reduces that time significantly. Additionally, to ensure more consistent benchmark results exclude the first iteration from measurements. While refactoring, errors were found in many stride arguments. These have been fixed. --- benchmark/benchmark.cpp | 222 ++++++++++++++++------------------------ 1 file changed, 86 insertions(+), 136 deletions(-) diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 8155ebd94..096207f94 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -12,27 +12,57 @@ extern size_t image_width, image_height; -template -static void bench_binary_op(Function f, benchmark::State& state) { - // Setup - std::vector src_a, src_b, dst; - src_a.resize(image_width * image_height); - src_b.resize(image_width * image_height); - dst.resize(image_width * image_height); +// Setting up buffers can be time-consuming and is not of interest when +// benchmarking. Therefore endeavour to reuse buffers where possible. This +// function creates a single shared buffer for each pair of template arguments. +// All bytes of the buffer are filled with the Value argument. +template +uint8_t* get_buffer() { + static std::vector result(image_width * image_height * PixelSize, + Value); + return result.data(); +} - std::mt19937 generator; - std::generate(src_a.begin(), src_a.end(), generator); - std::generate(src_b.begin(), src_b.end(), generator); +// Get a buffer suitable for using as the first input buffer. +template +const T* get_source_buffer_a() { + return reinterpret_cast(get_buffer()); +} +// Get a buffer suitable for using as the second input buffer. +template +const T* get_source_buffer_b() { + return reinterpret_cast(get_buffer()); +} + +// Get a buffer suitable for using as the destination buffer. +template +T* get_destination_buffer() { + // Value argument is only used here to differentiate from the source buffers. + return reinterpret_cast(get_buffer()); +} + +// Warms up the functor then benchmarks it. +template +void bench_functor(benchmark::State& state, F functor) { + // warm up + functor(); for (auto _ : state) { // This code gets benchmarked - auto unused = f(src_a.data(), image_width * sizeof(T), src_b.data(), - image_width * sizeof(T), dst.data(), - image_width * sizeof(T), image_width, image_height); - (void)unused; + functor(); } } +template +static void bench_binary_op(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f(get_source_buffer_a(), image_width * sizeof(T), + get_source_buffer_b(), image_width * sizeof(T), + get_destination_buffer(), image_width * sizeof(T), image_width, + image_height); + }); +} + #define BENCH_BINARY_OP(name, type) \ static void name(benchmark::State& state) { \ bench_binary_op(kleidicv_##name, state); \ @@ -46,58 +76,31 @@ BENCH_BINARY_OP(bitwise_and, uint8_t); BENCH_BINARY_OP(compare_equal_u8, uint8_t); BENCH_BINARY_OP(compare_greater_u8, uint8_t); -template -static void bench_unary_op(Function f, size_t channels, - benchmark::State& state) { - // Setup - std::vector src, dst; - src.resize(image_width * image_height * channels); - dst.resize(image_width * image_height * channels); - - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - - for (auto _ : state) { - // This code gets benchmarked - auto unused = f(src.data(), image_width * sizeof(T), dst.data(), - image_width * sizeof(T), image_width, image_height); - (void)unused; - } +template +static void bench_unary_op(Function f, benchmark::State& state) { + bench_functor(state, [f]() { + (void)f(get_source_buffer_a(), + image_width * Channels * sizeof(I), + get_destination_buffer(), + image_width * Channels * sizeof(O), image_width, image_height); + }); } -#define BENCH_UNARY_OP(name, channels, type) \ - static void name(benchmark::State& state) { \ - bench_unary_op(kleidicv_##name, channels, state); \ - } \ +#define BENCH_UNARY_OP(name, channels, type) \ + static void name(benchmark::State& state) { \ + bench_unary_op(kleidicv_##name, state); \ + } \ BENCHMARK(name) BENCH_UNARY_OP(rgb_to_yuv_u8, 3, uint8_t); BENCH_UNARY_OP(rgba_to_yuv_u8, 4, uint8_t); BENCH_UNARY_OP(bgr_to_yuv_u8, 3, uint8_t); BENCH_UNARY_OP(bgra_to_yuv_u8, 4, uint8_t); - -template -static void bench_unary_op(Function f, benchmark::State& state) { - // Setup - std::vector src; - std::vector dst; - src.resize(image_width * image_height); - dst.resize(image_width * image_height); - - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - - for (auto _ : state) { - // This code gets benchmarked - auto unused = f(src.data(), image_width, dst.data(), image_width, - image_width, image_height); - (void)unused; - } -} +BENCH_UNARY_OP(exp_f32, 1, float); #define BENCH_UNARY_OP_DIFFERENT_IO_TYPES(name, itype, otype) \ static void name(benchmark::State& state) { \ - bench_unary_op(kleidicv_##name, state); \ + bench_unary_op(kleidicv_##name, state); \ } \ BENCHMARK(name) @@ -107,42 +110,23 @@ BENCH_UNARY_OP_DIFFERENT_IO_TYPES(float_conversion_s8_f32, int8_t, float); BENCH_UNARY_OP_DIFFERENT_IO_TYPES(float_conversion_u8_f32, uint8_t, float); static void min_max_loc_u8(benchmark::State& state) { - // Setup - std::vector src; - src.resize(image_width * image_height); - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - - size_t min_offset = 0, max_offset = 0; - - for (auto _ : state) { - // This code gets benchmarked - auto unused = kleidicv_min_max_loc_u8( - src.data(), image_width * sizeof(uint8_t), image_width, image_height, - &min_offset, &max_offset); - (void)unused; - } + bench_functor(state, []() { + size_t min_offset, max_offset; + (void)kleidicv_min_max_loc_u8(get_source_buffer_a(), + image_width * sizeof(uint8_t), image_width, + image_height, &min_offset, &max_offset); + }); } BENCHMARK(min_max_loc_u8); template static void scale(Function f, float factor, float shift, benchmark::State& state) { - // Setup - std::vector src, dst; - src.resize(image_width * image_height); - dst.resize(image_width * image_height); - - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - - for (auto _ : state) { - // This code gets benchmarked - auto unused = - f(src.data(), image_width * sizeof(T), dst.data(), - image_width * sizeof(T), image_width, image_height, factor, shift); - (void)unused; - } + bench_functor(state, [f, factor, shift]() { + (void)f(get_source_buffer_a(), image_width * sizeof(T), + get_destination_buffer(), image_width * sizeof(T), image_width, + image_height, factor, shift); + }); } #define BENCH_SCALE(benchname, name, factor, shift, type) \ @@ -195,18 +179,12 @@ static void resize_linear(F f, size_t scale_x, size_t scale_y, size_t src_height = image_height / scale_y; size_t dst_width = src_width * scale_x; size_t dst_height = src_height * scale_y; - std::vector src, dst; - src.resize(src_width * src_height); - dst.resize(dst_width * dst_height); - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - for (auto _ : state) { - // This code gets benchmarked - auto unused = f(src.data(), src_width * sizeof(T), src_width, src_height, - dst.data(), dst_width * sizeof(T), dst_width, dst_height); - (void)unused; - } + bench_functor(state, [f, src_width, src_height, dst_width, dst_height]() { + (void)f(get_source_buffer_a(), src_width * sizeof(T), src_width, + src_height, get_destination_buffer(), dst_width * sizeof(T), + dst_width, dst_height); + }); } static void resize_linear_2x2_u8(benchmark::State& state) { @@ -229,62 +207,34 @@ static void resize_linear_4x4_f32(benchmark::State& state) { } BENCHMARK(resize_linear_4x4_f32); -template -static void gaussian_blur(Function f, size_t channels, - benchmark::State& state) { - // Setup - std::vector src, dst; - src.resize(image_width * image_height * channels); - dst.resize(image_width * image_height * channels); - - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - +template +static void gaussian_blur(Function f, benchmark::State& state) { kleidicv_filter_context_t* context; kleidicv_error_t err = - kleidicv_filter_create(&context, channels, 2 * sizeof(T), + kleidicv_filter_create(&context, Channels, 2 * sizeof(T), kleidicv_rectangle_t{image_width, image_height}); if (err != KLEIDICV_OK) { state.SkipWithError("Could not initialize Gaussian blur filter."); return; } - for (auto _ : state) { - // This code gets benchmarked - auto unused = f(src.data(), image_width * sizeof(T), dst.data(), - image_width * sizeof(T), image_width, image_height, - channels, KLEIDICV_BORDER_TYPE_REFLECT, context); - (void)unused; - } + bench_functor(state, [f, context]() { + (void)f(get_source_buffer_a(), + image_width * Channels * sizeof(T), + get_destination_buffer(), + image_width * Channels * sizeof(T), image_width, image_height, + Channels, KLEIDICV_BORDER_TYPE_REFLECT, context); + }); (void)kleidicv_filter_release(context); } static void gaussian_blur_7x7_u8_1ch(benchmark::State& state) { - gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 1, state); + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); } BENCHMARK(gaussian_blur_7x7_u8_1ch); static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) { - gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 3, state); + gaussian_blur(kleidicv_gaussian_blur_7x7_u8, state); } BENCHMARK(gaussian_blur_7x7_u8_3ch); - -static void exp_f32(benchmark::State& state) { - // Setup - std::vector src, dst; - src.resize(image_width * image_height); - dst.resize(image_width * image_height); - - std::mt19937 generator; - std::generate(src.begin(), src.end(), generator); - - for (auto _ : state) { - // This code gets benchmarked - auto unused = kleidicv_exp_f32(src.data(), image_width * sizeof(float), - dst.data(), image_width * sizeof(float), - image_width, image_height); - (void)unused; - } -} -BENCHMARK(exp_f32); -- GitLab