From dff42a9d15d30cbd972c0565ae904f18a2d19853 Mon Sep 17 00:00:00 2001
From: Mark Horvath <mark.horvath@arm.com>
Date: Fri, 17 May 2024 16:34:32 +0200
Subject: [PATCH 1/4] Add NEON store intrinsics for float32_t

---
 kleidicv/include/kleidicv/neon_intrinsics.h | 114 ++++++++++----------
 1 file changed, 60 insertions(+), 54 deletions(-)

diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h
index 22ef062f1..ed11762fb 100644
--- a/kleidicv/include/kleidicv/neon_intrinsics.h
+++ b/kleidicv/include/kleidicv/neon_intrinsics.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -343,59 +343,65 @@ static inline float32x4x4_t vld1q_x4(const float32_t *src) { return vld1q_f32_x4
 
 static inline void vst1(uint8_t *dst, uint8x8_t vec) { vst1_u8(dst, vec); }
 
-static inline void vst1q(int8_t   *dst, int8x16_t  vec) { vst1q_s8(dst, vec); }
-static inline void vst1q(uint8_t  *dst, uint8x16_t vec) { vst1q_u8(dst, vec); }
-static inline void vst1q(int16_t  *dst, int16x8_t  vec) { vst1q_s16(dst, vec); }
-static inline void vst1q(uint16_t *dst, uint16x8_t vec) { vst1q_u16(dst, vec); }
-static inline void vst1q(int32_t  *dst, int32x4_t  vec) { vst1q_s32(dst, vec); }
-static inline void vst1q(uint32_t *dst, uint32x4_t vec) { vst1q_u32(dst, vec); }
-static inline void vst1q(int64_t  *dst, int64x2_t  vec) { vst1q_s64(dst, vec); }
-static inline void vst1q(uint64_t *dst, uint64x2_t vec) { vst1q_u64(dst, vec); }
-
-static inline void vst2q(int8_t   *dst, int8x16x2_t  vec) { vst2q_s8(dst, vec); }
-static inline void vst2q(uint8_t  *dst, uint8x16x2_t vec) { vst2q_u8(dst, vec); }
-static inline void vst2q(int16_t  *dst, int16x8x2_t  vec) { vst2q_s16(dst, vec); }
-static inline void vst2q(uint16_t *dst, uint16x8x2_t vec) { vst2q_u16(dst, vec); }
-static inline void vst2q(int32_t  *dst, int32x4x2_t  vec) { vst2q_s32(dst, vec); }
-static inline void vst2q(uint32_t *dst, uint32x4x2_t vec) { vst2q_u32(dst, vec); }
-static inline void vst2q(int64_t  *dst, int64x2x2_t  vec) { vst2q_s64(dst, vec); }
-static inline void vst2q(uint64_t *dst, uint64x2x2_t vec) { vst2q_u64(dst, vec); }
-
-static inline void vst3q(int8_t   *dst, int8x16x3_t  vec) { vst3q_s8(dst, vec); }
-static inline void vst3q(uint8_t  *dst, uint8x16x3_t vec) { vst3q_u8(dst, vec); }
-static inline void vst3q(int16_t  *dst, int16x8x3_t  vec) { vst3q_s16(dst, vec); }
-static inline void vst3q(uint16_t *dst, uint16x8x3_t vec) { vst3q_u16(dst, vec); }
-static inline void vst3q(int32_t  *dst, int32x4x3_t  vec) { vst3q_s32(dst, vec); }
-static inline void vst3q(uint32_t *dst, uint32x4x3_t vec) { vst3q_u32(dst, vec); }
-static inline void vst3q(int64_t  *dst, int64x2x3_t  vec) { vst3q_s64(dst, vec); }
-static inline void vst3q(uint64_t *dst, uint64x2x3_t vec) { vst3q_u64(dst, vec); }
-
-static inline void vst4q(int8_t   *dst, int8x16x4_t  vec) { vst4q_s8(dst, vec); }
-static inline void vst4q(uint8_t  *dst, uint8x16x4_t vec) { vst4q_u8(dst, vec); }
-static inline void vst4q(int16_t  *dst, int16x8x4_t  vec) { vst4q_s16(dst, vec); }
-static inline void vst4q(uint16_t *dst, uint16x8x4_t vec) { vst4q_u16(dst, vec); }
-static inline void vst4q(int32_t  *dst, int32x4x4_t  vec) { vst4q_s32(dst, vec); }
-static inline void vst4q(uint32_t *dst, uint32x4x4_t vec) { vst4q_u32(dst, vec); }
-static inline void vst4q(int64_t  *dst, int64x2x4_t  vec) { vst4q_s64(dst, vec); }
-static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); }
-
-static inline void vst1q_x2(int8_t   *dst, int8x16x2_t  vec) { vst1q_s8_x2(dst, vec); }
-static inline void vst1q_x2(uint8_t  *dst, uint8x16x2_t vec) { vst1q_u8_x2(dst, vec); }
-static inline void vst1q_x2(int16_t  *dst, int16x8x2_t  vec) { vst1q_s16_x2(dst, vec); }
-static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { vst1q_u16_x2(dst, vec); }
-static inline void vst1q_x2(int32_t  *dst, int32x4x2_t  vec) { vst1q_s32_x2(dst, vec); }
-static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { vst1q_u32_x2(dst, vec); }
-static inline void vst1q_x2(int64_t  *dst, int64x2x2_t  vec) { vst1q_s64_x2(dst, vec); }
-static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { vst1q_u64_x2(dst, vec); }
-
-static inline void vst1q_x4(int8_t   *dst, int8x16x4_t  vec) { vst1q_s8_x4(dst, vec); }
-static inline void vst1q_x4(uint8_t  *dst, uint8x16x4_t vec) { vst1q_u8_x4(dst, vec); }
-static inline void vst1q_x4(int16_t  *dst, int16x8x4_t  vec) { vst1q_s16_x4(dst, vec); }
-static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { vst1q_u16_x4(dst, vec); }
-static inline void vst1q_x4(int32_t  *dst, int32x4x4_t  vec) { vst1q_s32_x4(dst, vec); }
-static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { vst1q_u32_x4(dst, vec); }
-static inline void vst1q_x4(int64_t  *dst, int64x2x4_t  vec) { vst1q_s64_x4(dst, vec); }
-static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { vst1q_u64_x4(dst, vec); }
+static inline void vst1q(int8_t    *dst, int8x16_t   vec) { vst1q_s8(dst, vec); }
+static inline void vst1q(uint8_t   *dst, uint8x16_t  vec) { vst1q_u8(dst, vec); }
+static inline void vst1q(int16_t   *dst, int16x8_t   vec) { vst1q_s16(dst, vec); }
+static inline void vst1q(uint16_t  *dst, uint16x8_t  vec) { vst1q_u16(dst, vec); }
+static inline void vst1q(int32_t   *dst, int32x4_t   vec) { vst1q_s32(dst, vec); }
+static inline void vst1q(uint32_t  *dst, uint32x4_t  vec) { vst1q_u32(dst, vec); }
+static inline void vst1q(int64_t   *dst, int64x2_t   vec) { vst1q_s64(dst, vec); }
+static inline void vst1q(uint64_t  *dst, uint64x2_t  vec) { vst1q_u64(dst, vec); }
+static inline void vst1q(float32_t *dst, float32x4_t vec) { vst1q_f32(dst, vec); }
+
+static inline void vst2q(int8_t    *dst, int8x16x2_t   vec) { vst2q_s8(dst, vec); }
+static inline void vst2q(uint8_t   *dst, uint8x16x2_t  vec) { vst2q_u8(dst, vec); }
+static inline void vst2q(int16_t   *dst, int16x8x2_t   vec) { vst2q_s16(dst, vec); }
+static inline void vst2q(uint16_t  *dst, uint16x8x2_t  vec) { vst2q_u16(dst, vec); }
+static inline void vst2q(int32_t   *dst, int32x4x2_t   vec) { vst2q_s32(dst, vec); }
+static inline void vst2q(uint32_t  *dst, uint32x4x2_t  vec) { vst2q_u32(dst, vec); }
+static inline void vst2q(int64_t   *dst, int64x2x2_t   vec) { vst2q_s64(dst, vec); }
+static inline void vst2q(uint64_t  *dst, uint64x2x2_t  vec) { vst2q_u64(dst, vec); }
+static inline void vst2q(float32_t *dst, float32x4x2_t vec) { vst2q_f32(dst, vec); }
+
+static inline void vst3q(int8_t    *dst, int8x16x3_t   vec) { vst3q_s8(dst, vec); }
+static inline void vst3q(uint8_t   *dst, uint8x16x3_t  vec) { vst3q_u8(dst, vec); }
+static inline void vst3q(int16_t   *dst, int16x8x3_t   vec) { vst3q_s16(dst, vec); }
+static inline void vst3q(uint16_t  *dst, uint16x8x3_t  vec) { vst3q_u16(dst, vec); }
+static inline void vst3q(int32_t   *dst, int32x4x3_t   vec) { vst3q_s32(dst, vec); }
+static inline void vst3q(uint32_t  *dst, uint32x4x3_t  vec) { vst3q_u32(dst, vec); }
+static inline void vst3q(int64_t   *dst, int64x2x3_t   vec) { vst3q_s64(dst, vec); }
+static inline void vst3q(uint64_t  *dst, uint64x2x3_t  vec) { vst3q_u64(dst, vec); }
+static inline void vst3q(float32_t *dst, float32x4x3_t vec) { vst3q_f32(dst, vec); }
+
+static inline void vst4q(int8_t    *dst, int8x16x4_t   vec) { vst4q_s8(dst, vec); }
+static inline void vst4q(uint8_t   *dst, uint8x16x4_t  vec) { vst4q_u8(dst, vec); }
+static inline void vst4q(int16_t   *dst, int16x8x4_t   vec) { vst4q_s16(dst, vec); }
+static inline void vst4q(uint16_t  *dst, uint16x8x4_t  vec) { vst4q_u16(dst, vec); }
+static inline void vst4q(int32_t   *dst, int32x4x4_t   vec) { vst4q_s32(dst, vec); }
+static inline void vst4q(uint32_t  *dst, uint32x4x4_t  vec) { vst4q_u32(dst, vec); }
+static inline void vst4q(int64_t   *dst, int64x2x4_t   vec) { vst4q_s64(dst, vec); }
+static inline void vst4q(uint64_t  *dst, uint64x2x4_t  vec) { vst4q_u64(dst, vec); }
+static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); }
+
+static inline void vst1q_x2(int8_t    *dst, int8x16x2_t   vec) { vst1q_s8_x2(dst, vec); }
+static inline void vst1q_x2(uint8_t   *dst, uint8x16x2_t  vec) { vst1q_u8_x2(dst, vec); }
+static inline void vst1q_x2(int16_t   *dst, int16x8x2_t   vec) { vst1q_s16_x2(dst, vec); }
+static inline void vst1q_x2(uint16_t  *dst, uint16x8x2_t  vec) { vst1q_u16_x2(dst, vec); }
+static inline void vst1q_x2(int32_t   *dst, int32x4x2_t   vec) { vst1q_s32_x2(dst, vec); }
+static inline void vst1q_x2(uint32_t  *dst, uint32x4x2_t  vec) { vst1q_u32_x2(dst, vec); }
+static inline void vst1q_x2(int64_t   *dst, int64x2x2_t   vec) { vst1q_s64_x2(dst, vec); }
+static inline void vst1q_x2(uint64_t  *dst, uint64x2x2_t  vec) { vst1q_u64_x2(dst, vec); }
+static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { vst1q_f32_x2(dst, vec); }
+
+static inline void vst1q_x4(int8_t    *dst, int8x16x4_t   vec) { vst1q_s8_x4(dst, vec); }
+static inline void vst1q_x4(uint8_t   *dst, uint8x16x4_t  vec) { vst1q_u8_x4(dst, vec); }
+static inline void vst1q_x4(int16_t   *dst, int16x8x4_t   vec) { vst1q_s16_x4(dst, vec); }
+static inline void vst1q_x4(uint16_t  *dst, uint16x8x4_t  vec) { vst1q_u16_x4(dst, vec); }
+static inline void vst1q_x4(int32_t   *dst, int32x4x4_t   vec) { vst1q_s32_x4(dst, vec); }
+static inline void vst1q_x4(uint32_t  *dst, uint32x4x4_t  vec) { vst1q_u32_x4(dst, vec); }
+static inline void vst1q_x4(int64_t   *dst, int64x2x4_t   vec) { vst1q_s64_x4(dst, vec); }
+static inline void vst1q_x4(uint64_t  *dst, uint64x2x4_t  vec) { vst1q_u64_x4(dst, vec); }
+static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { vst1q_f32_x4(dst, vec); }
 
 // -----------------------------------------------------------------------------
 // vreinterpret*
-- 
GitLab


From 93014a9696d65f4509c10268ae655ff3e5a236e0 Mon Sep 17 00:00:00 2001
From: Mark Horvath <mark.horvath@arm.com>
Date: Mon, 15 Apr 2024 15:52:02 +0200
Subject: [PATCH 2/4] Add NEON implementation and tests for exp_f32

---
 CHANGELOG.md                          |   1 +
 adapters/opencv/kleidicv_hal.cpp      |   5 +
 adapters/opencv/kleidicv_hal.h        |   5 +
 conformity/opencv/CMakeLists.txt      |   2 +
 conformity/opencv/test_exp.cpp        |  54 +++++++++
 conformity/opencv/test_exp.h          |  14 +++
 conformity/opencv/tests.cpp           |   2 +
 conformity/opencv/tests.h             |  36 +++++-
 doc/functionality.md                  |  21 ++--
 doc/opencv.md                         |   3 +
 kleidicv/include/kleidicv/kleidicv.h  |  26 +++++
 kleidicv/src/arithmetics/exp_api.cpp  |  25 ++++
 kleidicv/src/arithmetics/exp_neon.cpp | 101 ++++++++++++++++
 scripts/ci-opencv.sh                  |   1 +
 test/api/test_exp.cpp                 | 162 ++++++++++++++++++++++++++
 15 files changed, 446 insertions(+), 12 deletions(-)
 create mode 100644 conformity/opencv/test_exp.cpp
 create mode 100644 conformity/opencv/test_exp.h
 create mode 100644 kleidicv/src/arithmetics/exp_api.cpp
 create mode 100644 kleidicv/src/arithmetics/exp_neon.cpp
 create mode 100644 test/api/test_exp.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 831a0eb85..7c04ee50f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ This changelog aims to follow the guiding principles of
 ## 0.2.0 - not yet released
 
 ### Added
+- Exponential function for float.
 
 ### Fixed
 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index b94c451e0..380931a3c 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -706,4 +706,9 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
+int exp32f(const float *src, float *dst, int len) {
+  return convert_error(kleidicv_exp_f32(src, len * sizeof(float), dst,
+                                        len * sizeof(float), len, 1));
+}
+
 }  // namespace kleidicv::hal
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index a25be7719..62b96c7c0 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -97,9 +97,14 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
               uchar *dst_data, size_t dst_step, int dst_depth, int width,
               int height, double scale, double shift);
 
+int exp32f(const float *src, float *dst, int len);
+
 }  // namespace hal
 }  // namespace kleidicv
 
+#undef cv_hal_exp32f
+#define cv_hal_exp32f kleidicv::hal::exp32f
+
 // Other HAL implementations might require the cv namespace
 namespace cv {
 
diff --git a/conformity/opencv/CMakeLists.txt b/conformity/opencv/CMakeLists.txt
index 06912c661..88a80ff36 100644
--- a/conformity/opencv/CMakeLists.txt
+++ b/conformity/opencv/CMakeLists.txt
@@ -34,6 +34,7 @@ add_executable(
   test_min_max.cpp
   test_rgb2yuv.cpp
   test_sobel.cpp
+  test_exp.cpp
 )
 
 target_link_libraries(
@@ -72,6 +73,7 @@ add_executable(
   test_min_max.cpp
   test_rgb2yuv.cpp
   test_sobel.cpp
+  test_exp.cpp
 )
 
 target_link_libraries(
diff --git a/conformity/opencv/test_exp.cpp b/conformity/opencv/test_exp.cpp
new file mode 100644
index 000000000..8c59be8f2
--- /dev/null
+++ b/conformity/opencv/test_exp.cpp
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "test_exp.h"
+
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "opencv2/core/hal/interface.h"
+
+static cv::Mat exec_exp(cv::Mat& input_mat) {
+  cv::Mat result;
+  cv::exp(input_mat, result);
+  return result;
+}
+
+#if MANAGER
+template <size_t Channels>
+bool test_exp(int index, RecreatedMessageQueue& request_queue,
+              RecreatedMessageQueue& reply_queue) {
+  cv::RNG rng(0);
+
+  for (size_t x = 5; x <= 16; ++x) {
+    for (size_t y = 5; y <= 16; ++y) {
+      cv::Mat input_mat(x, y, CV_32FC(Channels));
+      // Use inputs where results are not flowing over or under
+      rng.fill(input_mat, cv::RNG::UNIFORM, -80.0, 80.0);
+      cv::Mat actual_mat = exec_exp(input_mat);
+      cv::Mat expected_mat = get_expected_from_subordinate(
+          index, request_queue, reply_queue, input_mat);
+
+      if (are_float_matrices_different<float>(1.5, expected_mat, actual_mat)) {
+        fail_print_matrices(x, y, input_mat, actual_mat, expected_mat);
+      }
+    }
+  }
+
+  return false;
+}
+#endif
+
+std::vector<test>& exp_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("Exp float, 1 channel", (test_exp<1>), exec_exp),
+    TEST("Exp float, 2 channel", (test_exp<2>), exec_exp),
+    TEST("Exp float, 3 channel", (test_exp<3>), exec_exp),
+    TEST("Exp float, 4 channel", (test_exp<4>), exec_exp),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/test_exp.h b/conformity/opencv/test_exp.h
new file mode 100644
index 000000000..1da690695
--- /dev/null
+++ b/conformity/opencv/test_exp.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
+#define KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
+
+#include <vector>
+
+#include "tests.h"
+
+std::vector<test>& exp_tests_get();
+
+#endif  // KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index 257f00d7b..657d5551e 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -10,6 +10,7 @@
 
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
+#include "test_exp.h"
 #include "test_gaussian_blur.h"
 #include "test_min_max.h"
 #include "test_rgb2yuv.h"
@@ -30,6 +31,7 @@ std::vector<test> all_tests = merge_tests({
     min_max_tests_get,
     rgb2yuv_tests_get,
     sobel_tests_get,
+    exp_tests_get,
 });
 
 #if MANAGER
diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h
index 073ee6999..cce5aefc4 100644
--- a/conformity/opencv/tests.h
+++ b/conformity/opencv/tests.h
@@ -17,13 +17,45 @@ static auto abs_diff(T a, T b) {
   return a > b ? a - b : b - a;
 }
 
-template <typename T>
-bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) {
+static inline bool check_matrix_size_and_type(cv::Mat& A, cv::Mat& B) {
   if (A.rows != B.rows || A.cols != B.cols || A.type() != B.type()) {
     std::cout << "Matrix size/type mismatch" << std::endl;
     return true;
   }
 
+  return false;
+}
+
+// Expected matrix should not contain zeros
+template <typename T>
+bool are_float_matrices_different(T threshold_percent, cv::Mat& exp,
+                                  cv::Mat& act) {
+  if (check_matrix_size_and_type(exp, act)) {
+    return true;
+  }
+
+  for (int i = 0; i < exp.rows; ++i) {
+    for (int j = 0; j < (exp.cols * CV_MAT_CN(exp.type())); ++j) {
+      T diff = abs_diff<T>(exp.at<T>(i, j), act.at<T>(i, j));
+      T diff_percentage = (diff / std::abs(exp.at<T>(i, j))) * 100;
+      if (diff_percentage > threshold_percent) {
+        std::cout << "=== Mismatch at: " << i << " " << j << std::endl
+                  << "Relative diff: " << diff_percentage << std::endl
+                  << std::endl;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+template <typename T>
+bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) {
+  if (check_matrix_size_and_type(A, B)) {
+    return true;
+  }
+
   for (int i = 0; i < A.rows; ++i) {
     for (int j = 0; j < (A.cols * CV_MAT_CN(A.type())); ++j) {
       if (abs_diff<T>(A.at<T>(i, j), B.at<T>(i, j)) > threshold) {
diff --git a/doc/functionality.md b/doc/functionality.md
index a9d7c45c0..0edc0259a 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -8,16 +8,17 @@ SPDX-License-Identifier: Apache-2.0
 Note: functions listed here are not necessarily exposed to adapter API layer.
 See `doc/opencv.md` for details of the functionality available in OpenCV.
 
-## Basic arithmetic operations
-|                              | s8  | u8  | s16 | u16 | s32 | u32 | s64 | u64 |
-|------------------------------|-----|-----|-----|-----|-----|-----|-----|-----|
-| Saturating Add               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |
-| Saturating Sub               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |
-| Saturating Absdiff           |  x  |  x  |  x  |  x  |  x  |     |     |     |
-| Saturating Multiply          |  x  |  x  |  x  |  x  |  x  |     |     |     |
-| Threshold binary             |     |  x  |     |     |     |     |     |     |
-| SaturatingAddAbsWithThreshold|     |     |  x  |     |     |     |     |     |
-| Scale                        |     |  x  |     |     |     |     |     |     |
+## Arithmetic operations
+|                              | s8  | u8  | s16 | u16 | s32 | u32 | s64 | u64 | f32 | f64 |
+|------------------------------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
+| Exp                          |     |     |     |     |     |     |     |     |  x  |     |
+| Saturating Add               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |     |     |
+| Saturating Sub               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |     |     |
+| Saturating Absdiff           |  x  |  x  |  x  |  x  |  x  |     |     |     |     |     |
+| Saturating Multiply          |  x  |  x  |  x  |  x  |  x  |     |     |     |     |     |
+| Threshold binary             |     |  x  |     |     |     |     |     |     |     |     |
+| SaturatingAddAbsWithThreshold|     |     |  x  |     |     |     |     |     |     |     |
+| Scale                        |     |  x  |     |     |     |     |     |     |     |     |
 
 ## Color conversions
 |           | u8  |
diff --git a/doc/opencv.md b/doc/opencv.md
index 8dccc17fa..99e287c56 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -168,3 +168,6 @@ Notes on parameters:
 
 ### `convertTo`
 Currently converting to different data types is not supported. This function scales given input of `src_depth == CV_8U` using `scale` and `shift`.
+
+### `exp`
+Exponential function. Currently only `CV_32F` type is supported.
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index cb0381c23..2dfbd8e9c 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1306,6 +1306,32 @@ KLEIDICV_API_DECLARATION(kleidicv_scale_u8, const uint8_t *src,
                          size_t src_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, float scale, float shift);
 
+/// Exponential function, input is the elements in `src`, output is the elements
+/// in `dst`.
+///
+/// In case of 'float' type the maximum error is 0.36565+0.5 ULP, or the error
+/// of the toolchains's expf implementation, if it is bigger.
+///
+/// Source and destination data length is `width` * `height`. Number of elements
+/// is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS.
+///
+/// @param src          Pointer to the source data. Must be non-null.
+/// @param src_stride   Distance in bytes from the start of one row to the
+///                     start of the next row for the source data. Must
+///                     not be less than width * sizeof(type).
+///                     Must be a multiple of sizeof(type).
+/// @param dst          Pointer to the destination data. Must be non-null.
+/// @param dst_stride   Distance in bytes from the start of one row to the
+///                     start of the next row for the destination data. Must
+///                     not be less than width * sizeof(type).
+///                     Must be a multiple of sizeof(type).
+/// @param width        Number of pixels in a row.
+/// @param height       Number of rows in the data.
+///
+KLEIDICV_API_DECLARATION(kleidicv_exp_f32, const float *src, size_t src_stride,
+                         float *dst, size_t dst_stride, size_t width,
+                         size_t height);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/kleidicv/src/arithmetics/exp_api.cpp b/kleidicv/src/arithmetics/exp_api.cpp
new file mode 100644
index 000000000..39c492719
--- /dev/null
+++ b/kleidicv/src/arithmetics/exp_api.cpp
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/dispatch.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/types.h"
+
+namespace kleidicv {
+
+namespace neon {
+
+template <typename T>
+kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride,
+                     size_t width, size_t height);
+
+}  // namespace neon
+
+}  // namespace kleidicv
+
+#define KLEIDICV_DEFINE_C_API(name, type)                                \
+  KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::exp<type>, nullptr, \
+                              nullptr)
+
+KLEIDICV_DEFINE_C_API(kleidicv_exp_f32, float);
diff --git a/kleidicv/src/arithmetics/exp_neon.cpp b/kleidicv/src/arithmetics/exp_neon.cpp
new file mode 100644
index 000000000..541424793
--- /dev/null
+++ b/kleidicv/src/arithmetics/exp_neon.cpp
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cmath>
+
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/neon.h"
+
+namespace kleidicv::neon {
+
+template <typename ScalarType>
+class Exp;
+
+template <>
+class Exp<float> final : public UnrollOnce {
+ public:
+  using VecTraits = neon::VecTraits<float>;
+  using VectorType = typename VecTraits::VectorType;
+
+  VectorType vector_path(VectorType src) {
+    float32x4_t n, r, scale, poly, absn, z;
+    uint32x4_t cmp, e;
+
+    /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+    z = vfmaq_f32(vdupq_n(kShift), src, vdupq_n(kInvLn2));
+    n = z - vdupq_n(kShift);
+    r = vfmaq_f32(src, n, vdupq_n(-kLn2Hi));
+    r = vfmaq_f32(r, n, vdupq_n(-kLn2Lo));
+    e = vreinterpretq_u32_f32(z) << 23;
+    scale = vreinterpretq_f32_u32(e + vdupq_n(0x3f800000));
+    absn = vabsq_f32(n);
+    cmp = absn > vdupq_n(126.0F);
+    poly = vfmaq_f32(vdupq_n(kPoly[1]), vdupq_n(kPoly[0]), r);
+    poly = vfmaq_f32(vdupq_n(kPoly[2]), poly, r);
+    poly = vfmaq_f32(vdupq_n(kPoly[3]), poly, r);
+    poly = vfmaq_f32(vdupq_n(kPoly[4]), poly, r);
+    poly = vfmaq_f32(vdupq_n(1.0F), poly, r);
+    poly = vfmaq_f32(vdupq_n(1.0F), poly, r);
+    if (KLEIDICV_UNLIKELY(v_any_u32(cmp))) {
+      return specialcase(poly, n, e, absn);
+    }
+    return scale * poly;
+  }
+
+  float scalar_path(float src) { return expf(src); }
+
+ private:
+  static int v_any_u32(uint32x4_t x) {
+    /* assume elements in x are either 0 or -1u.  */
+    return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0;
+  }
+
+  static float32x4_t specialcase(float32x4_t poly, float32x4_t n, uint32x4_t e,
+                                 float32x4_t absn) {
+    /* 2^n may overflow, break it up into s1*s2.  */
+    uint32x4_t b = (n <= vdupq_n(0.0F)) & vdupq_n(0x83000000);
+    float32x4_t s1 = vreinterpretq_f32_u32(vdupq_n(0x7f000000) + b);
+    float32x4_t s2 = vreinterpretq_f32_u32(e - b);
+    uint32x4_t cmp = absn > vdupq_n(192.0F);
+    float32x4_t r1 = s1 * s1;
+    float32x4_t r0 = poly * s1 * s2;
+    return vreinterpretq_f32_u32((cmp & vreinterpretq_u32_f32(r1)) |
+                                 (~cmp & vreinterpretq_u32_f32(r0)));
+  }
+
+  static constexpr float kShift = 0x1.8p23F;
+  static constexpr float kInvLn2 = 0x1.715476p+0F;
+  static constexpr float kLn2Hi = 0x1.62e4p-1F;
+  static constexpr float kLn2Lo = 0x1.7f7d1cp-20F;
+  static constexpr float kPoly[] = {
+      /*  maxerr: 0.36565 +0.5 ulp.  */
+      0x1.6a6000p-10F, 0x1.12718ep-7F, 0x1.555af0p-5F,
+      0x1.555430p-3F,  0x1.fffff4p-2F,
+  };
+};  // end of class Exp<float>
+
+template <typename T>
+kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride,
+                     size_t width, size_t height) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride);
+  CHECK_IMAGE_SIZE(width, height);
+
+  Exp<T> operation;
+  Rectangle rect{width, height};
+  Rows<const T> src_rows{src, src_stride};
+  Rows<T> dst_rows{dst, dst_stride};
+  apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  return KLEIDICV_OK;
+}
+
+#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp<type>(         \
+      const type* src, size_t src_stride, type* dst, size_t dst_stride, \
+      size_t width, size_t height)
+
+KLEIDICV_INSTANTIATE_TEMPLATE(float);
+
+}  // namespace kleidicv::neon
diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh
index a0982daff..00ab6b07b 100755
--- a/scripts/ci-opencv.sh
+++ b/scripts/ci-opencv.sh
@@ -54,6 +54,7 @@ CORE_TEST_PATTERNS=(
   '*Core_Transpose*'
   '*Core_MinMaxLoc*'
   '*Core_ConvertScale*'
+  '*Core_Exp*'
 )
 CORE_TEST_PATTERNS_STR="$(join_strings_with_colon "${CORE_TEST_PATTERNS[*]}")"
 ../../../conformity/opencv_kleidicv/bin/opencv_test_core \
diff --git a/test/api/test_exp.cpp b/test/api/test_exp.cpp
new file mode 100644
index 000000000..158cfcf54
--- /dev/null
+++ b/test/api/test_exp.cpp
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+
+#include "framework/generator.h"
+#include "framework/operation.h"
+#include "framework/utils.h"
+#include "kleidicv/kleidicv.h"
+
+#define KLEIDICV_EXP(type, suffix) \
+  KLEIDICV_API(exp, kleidicv_exp_##suffix, type)
+
+KLEIDICV_EXP(float, f32);
+
+static void check_1ulp_error(test::Array2D<float>& expected_array,
+                             test::Array2D<float>& actual_array) {
+  for (size_t i = 0; i < expected_array.height(); ++i) {
+    for (size_t j = 0; j < expected_array.width(); ++j) {
+      float expected = *(expected_array.at(i, j));
+      float actual = *(actual_array.at(i, j));
+      // Error of 1 ULP means that actual is either same as expected, or
+      // the next float value in negative of positive direction
+      EXPECT_EQ(std::nextafterf(actual, expected), expected);
+    }
+  }
+}
+
+template <typename ElementType>
+class ExpTestSpecial;
+
+template <>
+class ExpTestSpecial<float> final : public UnaryOperationTest<float> {
+  using ElementType = float;
+  using Elements = typename UnaryOperationTest<ElementType>::Elements;
+
+ public:
+  ExpTestSpecial() : test_elements_(input_values().size()) {
+    auto inputs = input_values();
+
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      test_elements_[i].values[0] = inputs[i];
+      // Expected values calculated as doubles to have 'perfect' references.
+      // As the NEON implementation reuses the toolchain's expf implementation
+      // for the tail path, the test expects that the error for expf is also
+      // less than 1 ULP.
+      test_elements_[i].values[1] =
+          static_cast<float>(exp(static_cast<double>(inputs[i])));
+    }
+  }
+
+ private:
+  static const std::vector<ElementType>& input_values() {
+    static const std::vector<ElementType> kInputValues = {
+        -105.31, -100.07, -81.012, -47.66, -3.1088, -0.21,
+        0.7,     6.2,     39.7201, 86.11,  88.947};
+
+    return kInputValues;
+  }
+
+  kleidicv_error_t call_api() override {
+    return exp<ElementType>()(
+        this->inputs_[0].data(), this->inputs_[0].stride(),
+        this->actual_[0].data(), this->actual_[0].stride(), this->width(),
+        this->height());
+  }
+
+  void check(kleidicv_error_t err) override {
+    EXPECT_EQ(KLEIDICV_OK, err);
+    check_1ulp_error(this->expected_[0], this->actual_[0]);
+  }
+
+  const std::vector<Elements>& test_elements() override {
+    return test_elements_;
+  }
+
+  void setup() override {
+    // Default input value is 0.0, so the default expected value needs to be set
+    // to 1.0
+    ElementType expected = 1.0;
+    this->expected_[0].fill(expected);
+    UnaryOperationTest<ElementType>::setup();
+  }
+
+  std::vector<Elements> test_elements_;
+};  // end of class ExpTestSpecial<float>
+
+template <typename ElementType>
+class ExpTestRandom;
+
+template <>
+class ExpTestRandom<float> {
+ public:
+  void test() {
+    const size_t kWidth = test::Options::vector_length() * 16;
+    constexpr size_t kHeight = 16;
+    test::PseudoRandomNumberGenerator<float> generator;
+    test::Array2D<float> input{kWidth, kHeight};
+    test::Array2D<float> expected{kWidth, kHeight};
+    test::Array2D<float> actual{kWidth, kHeight};
+    input.fill(generator);
+
+    fill_expected(input, expected);
+
+    EXPECT_EQ(KLEIDICV_OK,
+              exp<float>()(input.data(), input.stride(), actual.data(),
+                           actual.stride(), input.width(), input.height()));
+
+    check_1ulp_error(expected, actual);
+  }
+
+ private:
+  void fill_expected(test::Array2D<float>& input,
+                     test::Array2D<float>& expected) {
+    for (size_t i = 0; i < input.height(); ++i) {
+      for (size_t j = 0; j < input.width(); ++j) {
+        // Expected values calculated as doubles to have 'perfect' references.
+        // As the NEON implementation reuses the toolchain's expf implementation
+        // for the tail path, the test expects that the error for expf is also
+        // less than 1 ULP.
+        // NOLINTBEGIN(clang-analyzer-core.CallAndMessage)
+        *(expected.at(i, j)) =
+            static_cast<float>(exp(static_cast<double>(*(input.at(i, j)))));
+        // NOLINTEND(clang-analyzer-core.CallAndMessage)
+      }
+    }
+  }
+};  // end of class ExpTestRandom<float>
+
+template <typename ElementType>
+class Exp : public testing::Test {};
+using ElementTypes = ::testing::Types<float>;
+TYPED_TEST_SUITE(Exp, ElementTypes);
+
+TYPED_TEST(Exp, SpecialValues) {
+  ExpTestSpecial<TypeParam>{}.test();
+  ExpTestSpecial<TypeParam>{}
+      .with_padding(test::Options::vector_length())
+      .test();
+}
+
+TYPED_TEST(Exp, RandomValues) { ExpTestRandom<TypeParam>{}.test(); }
+
+TYPED_TEST(Exp, OversizeImage) {
+  TypeParam src[1] = {}, dst[1];
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            exp<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
+                             KLEIDICV_MAX_IMAGE_PIXELS + 1, 1));
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      exp<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
+                       KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS));
+}
+
+TYPED_TEST(Exp, NullPointers) {
+  TypeParam src[1] = {}, dst[1];
+  test::test_null_args(exp<TypeParam>(), src, sizeof(TypeParam), dst,
+                       sizeof(TypeParam), 1, 1);
+}
-- 
GitLab


From 71a0c2cebdb30d0d2cf0104fd580afd1c92c04bb Mon Sep 17 00:00:00 2001
From: Mark Horvath <mark.horvath@arm.com>
Date: Tue, 28 May 2024 09:25:37 +0200
Subject: [PATCH 3/4] Add long-running-tests option for the API test

This option can be used to skip long running tests by default, but run
them if explicitly stated on the CLI of the test executables.
---
 test/framework/test_main.cpp |  5 +++++
 test/framework/utils.cpp     |  2 ++
 test/framework/utils.h       | 12 ++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/test/framework/test_main.cpp b/test/framework/test_main.cpp
index a4d962e02..4c71daa45 100644
--- a/test/framework/test_main.cpp
+++ b/test/framework/test_main.cpp
@@ -22,6 +22,7 @@ static void parse_arguments(int argc, char **argv) {
   static struct option long_options[] = {
     {"vector-length", required_argument, nullptr, 'v'},
     {"seed", required_argument, nullptr, 's'},
+    {"long-running-tests", no_argument, nullptr, 'l'},
     {nullptr, 0, nullptr, 0}
   };
   // clang-format on
@@ -48,6 +49,10 @@ static void parse_arguments(int argc, char **argv) {
         Options::set_seed(std::stoull(optarg));
         is_seed_set = true;
         break;
+
+      case 'l':
+        Options::turn_on_long_running_tests();
+        break;
     }
   }
 
diff --git a/test/framework/utils.cpp b/test/framework/utils.cpp
index a88f42a11..bd090d73a 100644
--- a/test/framework/utils.cpp
+++ b/test/framework/utils.cpp
@@ -16,6 +16,8 @@
 
 bool MockMallocToFail::enabled = false;
 
+bool test::Options::are_long_running_tests_skipped_ = true;
+
 namespace test {
 
 template <typename ElementType>
diff --git a/test/framework/utils.h b/test/framework/utils.h
index 9ee2ea621..e5f0e46b0 100644
--- a/test/framework/utils.h
+++ b/test/framework/utils.h
@@ -67,6 +67,11 @@ class Options {
   // Returns seed to use.
   static uint64_t seed() { return seed_; }
 
+  // Whether long running tests should be skipped.
+  static bool are_long_running_tests_skipped() {
+    return are_long_running_tests_skipped_;
+  }
+
   // Returns the number of lanes in a vector for a given arithmetic type.
   template <typename ElementType,
             std::enable_if_t<std::is_arithmetic_v<ElementType>, bool> = true>
@@ -89,11 +94,18 @@ class Options {
   // Sets the seed.
   static void set_seed(uint64_t value) { seed_ = value; }
 
+  // Turns on long running tests.
+  static void turn_on_long_running_tests() {
+    are_long_running_tests_skipped_ = false;
+  }
+
  private:
   // Vector length being tested.
   static size_t vector_length_;
   // Seed to use.
   static uint64_t seed_;
+  // Whether long running tests should be skipped.
+  static bool are_long_running_tests_skipped_;
 };  // end of class Options
 
 // Prints all the elements in a two-dimensional space.
-- 
GitLab


From 4187e8fdc5e67d3f5a3bd0c49c011accfd78f8de Mon Sep 17 00:00:00 2001
From: Mark Horvath <mark.horvath@arm.com>
Date: Tue, 28 May 2024 09:30:34 +0200
Subject: [PATCH 4/4] Test exp_f32 for all sensible input values

The stated error range is tested for all input values where the result
is not 0.0 or inf.
This is a long running tests, so it is skipped by default.
---
 adapters/opencv/kleidicv_hal.h       |  11 ++-
 conformity/opencv/test_exp.cpp       |   2 +
 kleidicv/include/kleidicv/kleidicv.h |   2 +-
 test/api/test_exp.cpp                | 109 +++++++++++++++++++++++----
 4 files changed, 104 insertions(+), 20 deletions(-)

diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 62b96c7c0..25f1b9587 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -102,9 +102,6 @@ int exp32f(const float *src, float *dst, int len);
 }  // namespace hal
 }  // namespace kleidicv
 
-#undef cv_hal_exp32f
-#define cv_hal_exp32f kleidicv::hal::exp32f
-
 // Other HAL implementations might require the cv namespace
 namespace cv {
 
@@ -318,6 +315,14 @@ static inline int kleidicv_convertTo_with_fallback(
 #define cv_hal_convertTo kleidicv_convertTo_with_fallback
 #endif  // defined(cv_hal_convertTo)
 
+// exp32f
+static inline int kleidicv_exp32f_with_fallback(const float *src, float *dst,
+                                                int len) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(exp32f, cv_hal_exp32f, src, dst, len);
+}
+#undef cv_hal_exp32f
+#define cv_hal_exp32f kleidicv_exp32f_with_fallback
+
 #endif  // OPENCV_CORE_HAL_REPLACEMENT_HPP
 
 // Remove no longer needed macro definitions.
diff --git a/conformity/opencv/test_exp.cpp b/conformity/opencv/test_exp.cpp
index 8c59be8f2..eb97c96ac 100644
--- a/conformity/opencv/test_exp.cpp
+++ b/conformity/opencv/test_exp.cpp
@@ -31,6 +31,8 @@ bool test_exp(int index, RecreatedMessageQueue& request_queue,
       cv::Mat expected_mat = get_expected_from_subordinate(
           index, request_queue, reply_queue, input_mat);
 
+      // OpenCV works with less precision, so a relatively big expected error
+      // range is defined
       if (are_float_matrices_different<float>(1.5, expected_mat, actual_mat)) {
         fail_print_matrices(x, y, input_mat, actual_mat, expected_mat);
       }
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index 2dfbd8e9c..cb6fe639a 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1310,7 +1310,7 @@ KLEIDICV_API_DECLARATION(kleidicv_scale_u8, const uint8_t *src,
 /// in `dst`.
 ///
 /// In case of 'float' type the maximum error is 0.36565+0.5 ULP, or the error
-/// of the toolchains's expf implementation, if it is bigger.
+/// of the toolchain's expf implementation, if it is bigger.
 ///
 /// Source and destination data length is `width` * `height`. Number of elements
 /// is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS.
diff --git a/test/api/test_exp.cpp b/test/api/test_exp.cpp
index 158cfcf54..9347b21c1 100644
--- a/test/api/test_exp.cpp
+++ b/test/api/test_exp.cpp
@@ -20,7 +20,12 @@ static void check_1ulp_error(test::Array2D<float>& expected_array,
                              test::Array2D<float>& actual_array) {
   for (size_t i = 0; i < expected_array.height(); ++i) {
     for (size_t j = 0; j < expected_array.width(); ++j) {
+      // Seems like clang-tidy does not understand what the fill member function
+      // of test::Array2D does, so these exceptions are required
+      // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign)
       float expected = *(expected_array.at(i, j));
+      // NOLINTEND(clang-analyzer-core.uninitialized.Assign)
+
       float actual = *(actual_array.at(i, j));
       // Error of 1 ULP means that actual is either same as expected, or
       // the next float value in negative of positive direction
@@ -88,11 +93,37 @@ class ExpTestSpecial<float> final : public UnaryOperationTest<float> {
   std::vector<Elements> test_elements_;
 };  // end of class ExpTestSpecial<float>
 
+template <typename ElementType>
+class ExpTestCustomBase;
+
+template <>
+class ExpTestCustomBase<float> {
+ protected:
+  static void fill_expected(test::Array2D<float>& input,
+                            test::Array2D<float>& expected) {
+    for (size_t i = 0; i < input.height(); ++i) {
+      for (size_t j = 0; j < input.width(); ++j) {
+        // Expected values calculated as doubles to have 'perfect' references.
+        // As the NEON implementation reuses the toolchain's expf implementation
+        // for the tail path, the test expects that the error for expf is also
+        // less than 1 ULP.
+
+        // Seems like clang-tidy does not understand what the fill member
+        // function of test::Array2D does, so these exceptions are required
+        // NOLINTBEGIN(clang-analyzer-core.CallAndMessage)
+        *(expected.at(i, j)) =
+            static_cast<float>(exp(static_cast<double>(*(input.at(i, j)))));
+        // NOLINTEND(clang-analyzer-core.CallAndMessage)
+      }
+    }
+  }
+};  // end of class ExpTestCustomBase<float>
+
 template <typename ElementType>
 class ExpTestRandom;
 
 template <>
-class ExpTestRandom<float> {
+class ExpTestRandom<float> final : public ExpTestCustomBase<float> {
  public:
   void test() {
     const size_t kWidth = test::Options::vector_length() * 16;
@@ -111,24 +142,63 @@ class ExpTestRandom<float> {
 
     check_1ulp_error(expected, actual);
   }
+};  // end of class ExpTestRandom<float>
 
- private:
-  void fill_expected(test::Array2D<float>& input,
-                     test::Array2D<float>& expected) {
-    for (size_t i = 0; i < input.height(); ++i) {
-      for (size_t j = 0; j < input.width(); ++j) {
-        // Expected values calculated as doubles to have 'perfect' references.
-        // As the NEON implementation reuses the toolchain's expf implementation
-        // for the tail path, the test expects that the error for expf is also
-        // less than 1 ULP.
-        // NOLINTBEGIN(clang-analyzer-core.CallAndMessage)
-        *(expected.at(i, j)) =
-            static_cast<float>(exp(static_cast<double>(*(input.at(i, j)))));
-        // NOLINTEND(clang-analyzer-core.CallAndMessage)
-      }
+template <typename ElementType>
+class ExpTestAll;
+
+template <>
+class ExpTestAll<float> final : public ExpTestCustomBase<float> {
+ public:
+  void test() {
+    constexpr size_t kWidth = 1024;
+    constexpr size_t kHeight = 1024;
+    // Sweeping through a meaningful input range. The start value results in
+    // 0.0, the last value in inf.
+    LinearFloatGenerator generator{-104, 89};
+    test::Array2D<float> input{kWidth, kHeight};
+    test::Array2D<float> expected{kWidth, kHeight};
+    test::Array2D<float> actual{kWidth, kHeight};
+
+    while (generator.last_value_not_reached()) {
+      input.fill(generator);
+
+      fill_expected(input, expected);
+
+      EXPECT_EQ(KLEIDICV_OK,
+                exp<float>()(input.data(), input.stride(), actual.data(),
+                             actual.stride(), input.width(), input.height()));
+
+      check_1ulp_error(expected, actual);
     }
   }
-};  // end of class ExpTestRandom<float>
+
+ private:
+  class LinearFloatGenerator : public test::Generator<float> {
+   public:
+    LinearFloatGenerator(float start_value, float last_value)
+        : start_value_{start_value},
+          last_value_{last_value},
+          value_{start_value} {}
+
+    void reset() override { value_ = start_value_; }
+
+    std::optional<float> next() override {
+      float current_value = value_;
+      value_ = std::nextafterf(value_, last_value_);
+      return current_value;
+    }
+
+    bool last_value_not_reached() const {
+      return std::nextafterf(value_, last_value_) != value_;
+    }
+
+   private:
+    float start_value_;
+    float last_value_;
+    float value_;
+  };  // end of class LinearFloatGenerator
+};    // end of class ExpTestAll<float>
 
 template <typename ElementType>
 class Exp : public testing::Test {};
@@ -144,6 +214,13 @@ TYPED_TEST(Exp, SpecialValues) {
 
 TYPED_TEST(Exp, RandomValues) { ExpTestRandom<TypeParam>{}.test(); }
 
+TYPED_TEST(Exp, AllValues) {
+  if (test::Options::are_long_running_tests_skipped()) {
+    GTEST_SKIP() << "Long running exp test skipped";
+  }
+  ExpTestAll<TypeParam>{}.test();
+}
+
 TYPED_TEST(Exp, OversizeImage) {
   TypeParam src[1] = {}, dst[1];
   EXPECT_EQ(KLEIDICV_ERROR_RANGE,
-- 
GitLab