From 80c74d0bc2cf7a0f6372c6c7ebf990864e30721f Mon Sep 17 00:00:00 2001
From: Michael Platings <michael.platings@arm.com>
Date: Tue, 18 Jun 2024 15:49:29 +0100
Subject: [PATCH] Speed up benchmarks

Much of the benchmark time was spent initializing buffers. This change
reduces that time significantly.
Additionally, to ensure more consistent benchmark results, the first
iteration is excluded from measurements.

While refactoring, errors were found in many stride arguments. These
have been fixed.
---
 benchmark/benchmark.cpp | 222 ++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 136 deletions(-)
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 8155ebd94..096207f94 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -12,27 +12,57 @@
 
 extern size_t image_width, image_height;
 
-template <typename T, typename Function>
-static void bench_binary_op(Function f, benchmark::State& state) {
-  // Setup
-  std::vector<T> src_a, src_b, dst;
-  src_a.resize(image_width * image_height);
-  src_b.resize(image_width * image_height);
-  dst.resize(image_width * image_height);
+// Setting up buffers can be time-consuming and is not of interest when
+// benchmarking, so buffers are reused. Each distinct <PixelSize, Value> pair
+// gets one shared buffer of image_width * image_height * PixelSize bytes,
+// created on the first call with every byte set to Value.
+template <int PixelSize, int Value>
+uint8_t* get_buffer() {
+  static std::vector<uint8_t> result(image_width * image_height * PixelSize,
+                                     Value);
+  return result.data();
+}
 
-  std::mt19937 generator;
-  std::generate(src_a.begin(), src_a.end(), generator);
-  std::generate(src_b.begin(), src_b.end(), generator);
+// Get a buffer suitable for using as the first input buffer.
+template <typename T, int Channels = 1>
+const T* get_source_buffer_a() {
+  return reinterpret_cast<T*>(get_buffer<sizeof(T) * Channels, 0xA3>());
+}
 
+// Get a buffer suitable for using as the second input buffer.
+template <typename T, int Channels = 1>
+const T* get_source_buffer_b() {
+  return reinterpret_cast<T*>(get_buffer<sizeof(T) * Channels, 0x9E>());
+}
+
+// Get a buffer suitable for using as the destination buffer.
+template <typename T, int Channels = 1>
+T* get_destination_buffer() {
+  // Value argument is only used here to differentiate from the source buffers.
+  return reinterpret_cast<T*>(get_buffer<sizeof(T) * Channels, 0xC1>());
+}
+
+// Warms up the functor then benchmarks it.
+template <typename F>
+void bench_functor(benchmark::State& state, F functor) {
+  // Untimed warm-up call: keeps first-run costs out of the measurements.
+  functor();
   for (auto _ : state) {
     // This code gets benchmarked
-    auto unused = f(src_a.data(), image_width * sizeof(T), src_b.data(),
-                    image_width * sizeof(T), dst.data(),
-                    image_width * sizeof(T), image_width, image_height);
-    (void)unused;
+    functor();
   }
 }
 
+template <typename T, typename Function>
+static void bench_binary_op(Function f, benchmark::State& state) {
+  bench_functor(state, [f]() {
+    (void)f(get_source_buffer_a<T>(), image_width * sizeof(T),
+            get_source_buffer_b<T>(), image_width * sizeof(T),
+            get_destination_buffer<T>(), image_width * sizeof(T), image_width,
+            image_height);
+  });
+}
+
 #define BENCH_BINARY_OP(name, type)                \
   static void name(benchmark::State& state) {      \
     bench_binary_op<type>(kleidicv_##name, state); \
@@ -46,58 +76,31 @@ BENCH_BINARY_OP(bitwise_and, uint8_t);
 BENCH_BINARY_OP(compare_equal_u8, uint8_t);
 BENCH_BINARY_OP(compare_greater_u8, uint8_t);
 
-template <typename T, typename Function>
-static void bench_unary_op(Function f, size_t channels,
-                           benchmark::State& state) {
-  // Setup
-  std::vector<T> src, dst;
-  src.resize(image_width * image_height * channels);
-  dst.resize(image_width * image_height * channels);
-
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = f(src.data(), image_width * sizeof(T), dst.data(),
-                    image_width * sizeof(T), image_width, image_height);
-    (void)unused;
-  }
+template <typename I, typename O, int Channels, typename Function>
+static void bench_unary_op(Function f, benchmark::State& state) {
+  bench_functor(state, [f]() {
+    (void)f(get_source_buffer_a<I, Channels>(),
+            image_width * Channels * sizeof(I),
+            get_destination_buffer<O, Channels>(),
+            image_width * Channels * sizeof(O), image_width, image_height);
+  });
 }
 
-#define BENCH_UNARY_OP(name, channels, type)                \
-  static void name(benchmark::State& state) {               \
-    bench_unary_op<type>(kleidicv_##name, channels, state); \
-  }                                                         \
+#define BENCH_UNARY_OP(name, channels, type)                      \
+  static void name(benchmark::State& state) {                     \
+    bench_unary_op<type, type, channels>(kleidicv_##name, state); \
+  }                                                               \
   BENCHMARK(name)
 
 BENCH_UNARY_OP(rgb_to_yuv_u8, 3, uint8_t);
 BENCH_UNARY_OP(rgba_to_yuv_u8, 4, uint8_t);
 BENCH_UNARY_OP(bgr_to_yuv_u8, 3, uint8_t);
 BENCH_UNARY_OP(bgra_to_yuv_u8, 4, uint8_t);
-
-template <typename I, typename O, typename Function>
-static void bench_unary_op(Function f, benchmark::State& state) {
-  // Setup
-  std::vector<I> src;
-  std::vector<O> dst;
-  src.resize(image_width * image_height);
-  dst.resize(image_width * image_height);
-
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = f(src.data(), image_width, dst.data(), image_width,
-                    image_width, image_height);
-    (void)unused;
-  }
-}
+BENCH_UNARY_OP(exp_f32, 1, float);
 
 #define BENCH_UNARY_OP_DIFFERENT_IO_TYPES(name, itype, otype) \
   static void name(benchmark::State& state) {                 \
-    bench_unary_op<itype, otype>(kleidicv_##name, state);     \
+    bench_unary_op<itype, otype, 1>(kleidicv_##name, state);  \
   }                                                           \
   BENCHMARK(name)
 
@@ -107,42 +110,23 @@ BENCH_UNARY_OP_DIFFERENT_IO_TYPES(float_conversion_s8_f32, int8_t, float);
 BENCH_UNARY_OP_DIFFERENT_IO_TYPES(float_conversion_u8_f32, uint8_t, float);
 
 static void min_max_loc_u8(benchmark::State& state) {
-  // Setup
-  std::vector<uint8_t> src;
-  src.resize(image_width * image_height);
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
-  size_t min_offset = 0, max_offset = 0;
-
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = kleidicv_min_max_loc_u8(
-        src.data(), image_width * sizeof(uint8_t), image_width, image_height,
-        &min_offset, &max_offset);
-    (void)unused;
-  }
+  bench_functor(state, []() {
+    size_t min_offset, max_offset;
+    (void)kleidicv_min_max_loc_u8(get_source_buffer_a<uint8_t>(),
+                                  image_width * sizeof(uint8_t), image_width,
+                                  image_height, &min_offset, &max_offset);
+  });
 }
 BENCHMARK(min_max_loc_u8);
 
 template <typename T, typename Function>
 static void scale(Function f, float factor, float shift,
                   benchmark::State& state) {
-  // Setup
-  std::vector<T> src, dst;
-  src.resize(image_width * image_height);
-  dst.resize(image_width * image_height);
-
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused =
-        f(src.data(), image_width * sizeof(T), dst.data(),
-          image_width * sizeof(T), image_width, image_height, factor, shift);
-    (void)unused;
-  }
+  bench_functor(state, [f, factor, shift]() {
+    (void)f(get_source_buffer_a<T>(), image_width * sizeof(T),
+            get_destination_buffer<T>(), image_width * sizeof(T), image_width,
+            image_height, factor, shift);
+  });
 }
 
 #define BENCH_SCALE(benchname, name, factor, shift, type) \
@@ -195,18 +179,12 @@ static void resize_linear(F f, size_t scale_x, size_t scale_y,
   size_t src_height = image_height / scale_y;
   size_t dst_width = src_width * scale_x;
   size_t dst_height = src_height * scale_y;
-  std::vector<T> src, dst;
-  src.resize(src_width * src_height);
-  dst.resize(dst_width * dst_height);
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
 
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = f(src.data(), src_width * sizeof(T), src_width, src_height,
-                    dst.data(), dst_width * sizeof(T), dst_width, dst_height);
-    (void)unused;
-  }
+  bench_functor(state, [f, src_width, src_height, dst_width, dst_height]() {
+    (void)f(get_source_buffer_a<T>(), src_width * sizeof(T), src_width,
+            src_height, get_destination_buffer<T>(), dst_width * sizeof(T),
+            dst_width, dst_height);
+  });
 }
 
 static void resize_linear_2x2_u8(benchmark::State& state) {
@@ -229,62 +207,34 @@ static void resize_linear_4x4_f32(benchmark::State& state) {
 }
 BENCHMARK(resize_linear_4x4_f32);
 
-template <typename T, typename Function>
-static void gaussian_blur(Function f, size_t channels,
-                          benchmark::State& state) {
-  // Setup
-  std::vector<T> src, dst;
-  src.resize(image_width * image_height * channels);
-  dst.resize(image_width * image_height * channels);
-
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
+template <typename T, int Channels, typename Function>
+static void gaussian_blur(Function f, benchmark::State& state) {
   kleidicv_filter_context_t* context;
   kleidicv_error_t err =
-      kleidicv_filter_create(&context, channels, 2 * sizeof(T),
+      kleidicv_filter_create(&context, Channels, 2 * sizeof(T),
                              kleidicv_rectangle_t{image_width, image_height});
   if (err != KLEIDICV_OK) {
     state.SkipWithError("Could not initialize Gaussian blur filter.");
     return;
   }
 
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = f(src.data(), image_width * sizeof(T), dst.data(),
-                    image_width * sizeof(T), image_width, image_height,
-                    channels, KLEIDICV_BORDER_TYPE_REFLECT, context);
-    (void)unused;
-  }
+  bench_functor(state, [f, context]() {
+    (void)f(get_source_buffer_a<T, Channels>(),
+            image_width * Channels * sizeof(T),
+            get_destination_buffer<T, Channels>(),
+            image_width * Channels * sizeof(T), image_width, image_height,
+            Channels, KLEIDICV_BORDER_TYPE_REFLECT, context);
+  });
 
   (void)kleidicv_filter_release(context);
 }
 
 static void gaussian_blur_7x7_u8_1ch(benchmark::State& state) {
-  gaussian_blur<uint8_t>(kleidicv_gaussian_blur_7x7_u8, 1, state);
+  gaussian_blur<uint8_t, 1>(kleidicv_gaussian_blur_7x7_u8, state);
 }
 BENCHMARK(gaussian_blur_7x7_u8_1ch);
 
 static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) {
-  gaussian_blur<uint8_t>(kleidicv_gaussian_blur_7x7_u8, 3, state);
+  gaussian_blur<uint8_t, 3>(kleidicv_gaussian_blur_7x7_u8, state);
 }
 BENCHMARK(gaussian_blur_7x7_u8_3ch);
-
-static void exp_f32(benchmark::State& state) {
-  // Setup
-  std::vector<float> src, dst;
-  src.resize(image_width * image_height);
-  dst.resize(image_width * image_height);
-
-  std::mt19937 generator;
-  std::generate(src.begin(), src.end(), generator);
-
-  for (auto _ : state) {
-    // This code gets benchmarked
-    auto unused = kleidicv_exp_f32(src.data(), image_width * sizeof(float),
-                                   dst.data(), image_width * sizeof(float),
-                                   image_width, image_height);
-    (void)unused;
-  }
-}
-BENCHMARK(exp_f32);
-- 
GitLab