diff --git a/CHANGELOG.md b/CHANGELOG.md
index 831a0eb8513a8fed1e853aa38829ee923bda3903..7c04ee50f3b7e61b5314a019ad25db1d265a2e91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ This changelog aims to follow the guiding principles of
 ## 0.2.0 - not yet released
 
 ### Added
+- Exponential function for float.
 
 ### Fixed
 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index b94c451e0a743b44758576a4722d5d38063aa697..380931a3cc3d334049c266735c62ee3006c343e0 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -706,4 +706,9 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
   return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
+int exp32f(const float *src, float *dst, int len) {
+  return convert_error(kleidicv_exp_f32(src, len * sizeof(float), dst,
+                                        len * sizeof(float), len, 1));
+}
+
 }  // namespace kleidicv::hal
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index a25be77194304c9dfe8896a75188837085c2dacc..25f1b9587c6dddb3eee33efb0323b85a3153ffa4 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -97,6 +97,8 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
               uchar *dst_data, size_t dst_step, int dst_depth, int width,
               int height, double scale, double shift);
 
+int exp32f(const float *src, float *dst, int len);
+
 }  // namespace hal
 }  // namespace kleidicv
 
@@ -313,6 +315,14 @@ static inline int kleidicv_convertTo_with_fallback(
 #define cv_hal_convertTo kleidicv_convertTo_with_fallback
 #endif  // defined(cv_hal_convertTo)
 
+// exp32f
+static inline int kleidicv_exp32f_with_fallback(const float *src, float *dst,
+                                                int len) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(exp32f, cv_hal_exp32f, src, dst, len);
+}
+#undef cv_hal_exp32f
+#define cv_hal_exp32f kleidicv_exp32f_with_fallback
+
 #endif  // OPENCV_CORE_HAL_REPLACEMENT_HPP
 
 // Remove no longer needed macro definitions.
diff --git a/conformity/opencv/CMakeLists.txt b/conformity/opencv/CMakeLists.txt
index 06912c661efd02d52d4f9fabf39644410e7691e6..88a80ff36d78da66b541b9f684638db21d96f7e1 100644
--- a/conformity/opencv/CMakeLists.txt
+++ b/conformity/opencv/CMakeLists.txt
@@ -34,6 +34,7 @@ add_executable(
   test_min_max.cpp
   test_rgb2yuv.cpp
   test_sobel.cpp
+  test_exp.cpp
 )
 
 target_link_libraries(
@@ -72,6 +73,7 @@ add_executable(
   test_min_max.cpp
   test_rgb2yuv.cpp
   test_sobel.cpp
+  test_exp.cpp
 )
 
 target_link_libraries(
diff --git a/conformity/opencv/test_exp.cpp b/conformity/opencv/test_exp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb97c96ac5d4c23758498efc0afdc1e83c2a0da8
--- /dev/null
+++ b/conformity/opencv/test_exp.cpp
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "test_exp.h"
+
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "opencv2/core/hal/interface.h"
+
+static cv::Mat exec_exp(cv::Mat& input_mat) {
+  cv::Mat result;
+  cv::exp(input_mat, result);
+  return result;
+}
+
+#if MANAGER
+template <size_t Channels>
+bool test_exp(int index, RecreatedMessageQueue& request_queue,
+              RecreatedMessageQueue& reply_queue) {
+  cv::RNG rng(0);
+
+  for (size_t x = 5; x <= 16; ++x) {
+    for (size_t y = 5; y <= 16; ++y) {
+      cv::Mat input_mat(x, y, CV_32FC(Channels));
+      // Use inputs where results are not flowing over or under
+      rng.fill(input_mat, cv::RNG::UNIFORM, -80.0, 80.0);
+      cv::Mat actual_mat = exec_exp(input_mat);
+      cv::Mat expected_mat = get_expected_from_subordinate(
+          index, request_queue, reply_queue, input_mat);
+
+      // OpenCV works with less precision, so a relatively big expected error
+      // range is defined
+      if (are_float_matrices_different<float>(1.5, expected_mat, actual_mat)) {
+        fail_print_matrices(x, y, input_mat, actual_mat, expected_mat);
+      }
+    }
+  }
+
+  return false;
+}
+#endif
+
+std::vector<test>& exp_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("Exp float, 1 channel", (test_exp<1>), exec_exp),
+    TEST("Exp float, 2 channel", (test_exp<2>), exec_exp),
+    TEST("Exp float, 3 channel", (test_exp<3>), exec_exp),
+    TEST("Exp float, 4 channel", (test_exp<4>), exec_exp),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/test_exp.h b/conformity/opencv/test_exp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1da69069511dbc4d129bf4526b34eb44e04559af
--- /dev/null
+++ b/conformity/opencv/test_exp.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
+#define KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
+
+#include <vector>
+
+#include "tests.h"
+
+std::vector<test>& exp_tests_get();
+
+#endif  // KLEIDICV_OPENCV_CONFORMITY_TEST_EXP_H_
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index 257f00d7b32b5b7d52179ced0a3dce80ef371c1d..657d5551e77f5e3da6e021a7c3ed6b5616ce6353 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -10,6 +10,7 @@
 
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
+#include "test_exp.h"
 #include "test_gaussian_blur.h"
 #include "test_min_max.h"
 #include "test_rgb2yuv.h"
@@ -30,6 +31,7 @@ std::vector<test> all_tests = merge_tests({
     min_max_tests_get,
     rgb2yuv_tests_get,
     sobel_tests_get,
+    exp_tests_get,
 });
 
 #if MANAGER
diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h
index 073ee69997ac7549d72cd13d5e8a343d4fac72ef..cce5aefc46873029b6984591c514c6bdc2e0aea4 100644
--- a/conformity/opencv/tests.h
+++ b/conformity/opencv/tests.h
@@ -17,13 +17,45 @@ static auto abs_diff(T a, T b) {
   return a > b ? a - b : b - a;
 }
 
-template <typename T>
-bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) {
+static inline bool check_matrix_size_and_type(cv::Mat& A, cv::Mat& B) {
   if (A.rows != B.rows || A.cols != B.cols || A.type() != B.type()) {
     std::cout << "Matrix size/type mismatch" << std::endl;
     return true;
   }
 
+  return false;
+}
+
+// Expected matrix should not contain zeros
+template <typename T>
+bool are_float_matrices_different(T threshold_percent, cv::Mat& exp,
+                                  cv::Mat& act) {
+  if (check_matrix_size_and_type(exp, act)) {
+    return true;
+  }
+
+  for (int i = 0; i < exp.rows; ++i) {
+    for (int j = 0; j < (exp.cols * CV_MAT_CN(exp.type())); ++j) {
+      T diff = abs_diff<T>(exp.at<T>(i, j), act.at<T>(i, j));
+      T diff_percentage = (diff / std::abs(exp.at<T>(i, j))) * 100;
+      if (diff_percentage > threshold_percent) {
+        std::cout << "=== Mismatch at: " << i << " " << j << std::endl
+                  << "Relative diff: " << diff_percentage << std::endl
+                  << std::endl;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+template <typename T>
+bool are_matrices_different(T threshold, cv::Mat& A, cv::Mat& B) {
+  if (check_matrix_size_and_type(A, B)) {
+    return true;
+  }
+
   for (int i = 0; i < A.rows; ++i) {
     for (int j = 0; j < (A.cols * CV_MAT_CN(A.type())); ++j) {
       if (abs_diff<T>(A.at<T>(i, j), B.at<T>(i, j)) > threshold) {
diff --git a/doc/functionality.md b/doc/functionality.md
index a9d7c45c05ff6cf6587be31f6d2fda07a67569ee..0edc0259ad718c8f5d1bb6b4fa4945631b56f20d 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -8,16 +8,17 @@ SPDX-License-Identifier: Apache-2.0
 Note: functions listed here are not necessarily exposed to adapter API layer.
 See `doc/opencv.md` for details of the functionality available in OpenCV.
 
-## Basic arithmetic operations
-|                              | s8  | u8  | s16 | u16 | s32 | u32 | s64 | u64 |
-|------------------------------|-----|-----|-----|-----|-----|-----|-----|-----|
-| Saturating Add               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |
-| Saturating Sub               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |
-| Saturating Absdiff           |  x  |  x  |  x  |  x  |  x  |     |     |     |
-| Saturating Multiply          |  x  |  x  |  x  |  x  |  x  |     |     |     |
-| Threshold binary             |     |  x  |     |     |     |     |     |     |
-| SaturatingAddAbsWithThreshold|     |     |  x  |     |     |     |     |     |
-| Scale                        |     |  x  |     |     |     |     |     |     |
+## Arithmetic operations
+|                              | s8  | u8  | s16 | u16 | s32 | u32 | s64 | u64 | f32 | f64 |
+|------------------------------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
+| Exp                          |     |     |     |     |     |     |     |     |  x  |     |
+| Saturating Add               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |     |     |
+| Saturating Sub               |  x  |  x  |  x  |  x  |  x  |  x  |  x  |  x  |     |     |
+| Saturating Absdiff           |  x  |  x  |  x  |  x  |  x  |     |     |     |     |     |
+| Saturating Multiply          |  x  |  x  |  x  |  x  |  x  |     |     |     |     |     |
+| Threshold binary             |     |  x  |     |     |     |     |     |     |     |     |
+| SaturatingAddAbsWithThreshold|     |     |  x  |     |     |     |     |     |     |     |
+| Scale                        |     |  x  |     |     |     |     |     |     |     |     |
 
 ## Color conversions
 |           | u8  |
diff --git a/doc/opencv.md b/doc/opencv.md
index 8dccc17fa257eaf56dceebfc4c8fcbe0e741a54a..99e287c566a611e793b08bf4ef240a0ea2cc8ea9 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -168,3 +168,6 @@ Notes on parameters:
 
 ### `convertTo`
 Currently converting to different data types is not supported. This function scales given input of `src_depth == CV_8U` using `scale` and `shift`.
+
+### `exp`
+Exponential function. Currently only `CV_32F` type is supported.
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index cb0381c23b314a00ee4bec749f5fdfd39e92b44a..cb6fe639a86e42c87409ec6f955c71c7005835e2 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1306,6 +1306,32 @@ KLEIDICV_API_DECLARATION(kleidicv_scale_u8, const uint8_t *src,
                          size_t src_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, float scale, float shift);
 
+/// Exponential function, input is the elements in `src`, output is the elements
+/// in `dst`.
+///
+/// In case of 'float' type the maximum error is 0.36565+0.5 ULP, or the error
+/// of the toolchain's expf implementation, if it is bigger.
+///
+/// Source and destination data length is `width` * `height`. Number of elements
+/// is limited to @ref KLEIDICV_MAX_IMAGE_PIXELS.
+///
+/// @param src          Pointer to the source data. Must be non-null.
+/// @param src_stride   Distance in bytes from the start of one row to the
+///                     start of the next row for the source data. Must
+///                     not be less than width * sizeof(type).
+///                     Must be a multiple of sizeof(type).
+/// @param dst          Pointer to the destination data. Must be non-null.
+/// @param dst_stride   Distance in bytes from the start of one row to the
+///                     start of the next row for the destination data. Must
+///                     not be less than width * sizeof(type).
+///                     Must be a multiple of sizeof(type).
+/// @param width        Number of pixels in a row.
+/// @param height       Number of rows in the data.
+///
+KLEIDICV_API_DECLARATION(kleidicv_exp_f32, const float *src, size_t src_stride,
+                         float *dst, size_t dst_stride, size_t width,
+                         size_t height);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/kleidicv/include/kleidicv/neon_intrinsics.h b/kleidicv/include/kleidicv/neon_intrinsics.h
index 22ef062f101f4527f2a8c0654bd198c530b1cf19..ed11762fbeb7e7f5fc442a9252c9283620755086 100644
--- a/kleidicv/include/kleidicv/neon_intrinsics.h
+++ b/kleidicv/include/kleidicv/neon_intrinsics.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -343,59 +343,65 @@ static inline float32x4x4_t vld1q_x4(const float32_t *src) { return vld1q_f32_x4
 
 static inline void vst1(uint8_t *dst, uint8x8_t vec) { vst1_u8(dst, vec); }
 
-static inline void vst1q(int8_t   *dst, int8x16_t  vec) { vst1q_s8(dst, vec); }
-static inline void vst1q(uint8_t  *dst, uint8x16_t vec) { vst1q_u8(dst, vec); }
-static inline void vst1q(int16_t  *dst, int16x8_t  vec) { vst1q_s16(dst, vec); }
-static inline void vst1q(uint16_t *dst, uint16x8_t vec) { vst1q_u16(dst, vec); }
-static inline void vst1q(int32_t  *dst, int32x4_t  vec) { vst1q_s32(dst, vec); }
-static inline void vst1q(uint32_t *dst, uint32x4_t vec) { vst1q_u32(dst, vec); }
-static inline void vst1q(int64_t  *dst, int64x2_t  vec) { vst1q_s64(dst, vec); }
-static inline void vst1q(uint64_t *dst, uint64x2_t vec) { vst1q_u64(dst, vec); }
-
-static inline void vst2q(int8_t   *dst, int8x16x2_t  vec) { vst2q_s8(dst, vec); }
-static inline void vst2q(uint8_t  *dst, uint8x16x2_t vec) { vst2q_u8(dst, vec); }
-static inline void vst2q(int16_t  *dst, int16x8x2_t  vec) { vst2q_s16(dst, vec); }
-static inline void vst2q(uint16_t *dst, uint16x8x2_t vec) { vst2q_u16(dst, vec); }
-static inline void vst2q(int32_t  *dst, int32x4x2_t  vec) { vst2q_s32(dst, vec); }
-static inline void vst2q(uint32_t *dst, uint32x4x2_t vec) { vst2q_u32(dst, vec); }
-static inline void vst2q(int64_t  *dst, int64x2x2_t  vec) { vst2q_s64(dst, vec); }
-static inline void vst2q(uint64_t *dst, uint64x2x2_t vec) { vst2q_u64(dst, vec); }
-
-static inline void vst3q(int8_t   *dst, int8x16x3_t  vec) { vst3q_s8(dst, vec); }
-static inline void vst3q(uint8_t  *dst, uint8x16x3_t vec) { vst3q_u8(dst, vec); }
-static inline void vst3q(int16_t  *dst, int16x8x3_t  vec) { vst3q_s16(dst, vec); }
-static inline void vst3q(uint16_t *dst, uint16x8x3_t vec) { vst3q_u16(dst, vec); }
-static inline void vst3q(int32_t  *dst, int32x4x3_t  vec) { vst3q_s32(dst, vec); }
-static inline void vst3q(uint32_t *dst, uint32x4x3_t vec) { vst3q_u32(dst, vec); }
-static inline void vst3q(int64_t  *dst, int64x2x3_t  vec) { vst3q_s64(dst, vec); }
-static inline void vst3q(uint64_t *dst, uint64x2x3_t vec) { vst3q_u64(dst, vec); }
-
-static inline void vst4q(int8_t   *dst, int8x16x4_t  vec) { vst4q_s8(dst, vec); }
-static inline void vst4q(uint8_t  *dst, uint8x16x4_t vec) { vst4q_u8(dst, vec); }
-static inline void vst4q(int16_t  *dst, int16x8x4_t  vec) { vst4q_s16(dst, vec); }
-static inline void vst4q(uint16_t *dst, uint16x8x4_t vec) { vst4q_u16(dst, vec); }
-static inline void vst4q(int32_t  *dst, int32x4x4_t  vec) { vst4q_s32(dst, vec); }
-static inline void vst4q(uint32_t *dst, uint32x4x4_t vec) { vst4q_u32(dst, vec); }
-static inline void vst4q(int64_t  *dst, int64x2x4_t  vec) { vst4q_s64(dst, vec); }
-static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); }
-
-static inline void vst1q_x2(int8_t   *dst, int8x16x2_t  vec) { vst1q_s8_x2(dst, vec); }
-static inline void vst1q_x2(uint8_t  *dst, uint8x16x2_t vec) { vst1q_u8_x2(dst, vec); }
-static inline void vst1q_x2(int16_t  *dst, int16x8x2_t  vec) { vst1q_s16_x2(dst, vec); }
-static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { vst1q_u16_x2(dst, vec); }
-static inline void vst1q_x2(int32_t  *dst, int32x4x2_t  vec) { vst1q_s32_x2(dst, vec); }
-static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { vst1q_u32_x2(dst, vec); }
-static inline void vst1q_x2(int64_t  *dst, int64x2x2_t  vec) { vst1q_s64_x2(dst, vec); }
-static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { vst1q_u64_x2(dst, vec); }
-
-static inline void vst1q_x4(int8_t   *dst, int8x16x4_t  vec) { vst1q_s8_x4(dst, vec); }
-static inline void vst1q_x4(uint8_t  *dst, uint8x16x4_t vec) { vst1q_u8_x4(dst, vec); }
-static inline void vst1q_x4(int16_t  *dst, int16x8x4_t  vec) { vst1q_s16_x4(dst, vec); }
-static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { vst1q_u16_x4(dst, vec); }
-static inline void vst1q_x4(int32_t  *dst, int32x4x4_t  vec) { vst1q_s32_x4(dst, vec); }
-static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { vst1q_u32_x4(dst, vec); }
-static inline void vst1q_x4(int64_t  *dst, int64x2x4_t  vec) { vst1q_s64_x4(dst, vec); }
-static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { vst1q_u64_x4(dst, vec); }
+static inline void vst1q(int8_t    *dst, int8x16_t   vec) { vst1q_s8(dst, vec); }
+static inline void vst1q(uint8_t   *dst, uint8x16_t  vec) { vst1q_u8(dst, vec); }
+static inline void vst1q(int16_t   *dst, int16x8_t   vec) { vst1q_s16(dst, vec); }
+static inline void vst1q(uint16_t  *dst, uint16x8_t  vec) { vst1q_u16(dst, vec); }
+static inline void vst1q(int32_t   *dst, int32x4_t   vec) { vst1q_s32(dst, vec); }
+static inline void vst1q(uint32_t  *dst, uint32x4_t  vec) { vst1q_u32(dst, vec); }
+static inline void vst1q(int64_t   *dst, int64x2_t   vec) { vst1q_s64(dst, vec); }
+static inline void vst1q(uint64_t  *dst, uint64x2_t  vec) { vst1q_u64(dst, vec); }
+static inline void vst1q(float32_t *dst, float32x4_t vec) { vst1q_f32(dst, vec); }
+
+static inline void vst2q(int8_t    *dst, int8x16x2_t   vec) { vst2q_s8(dst, vec); }
+static inline void vst2q(uint8_t   *dst, uint8x16x2_t  vec) { vst2q_u8(dst, vec); }
+static inline void vst2q(int16_t   *dst, int16x8x2_t   vec) { vst2q_s16(dst, vec); }
+static inline void vst2q(uint16_t  *dst, uint16x8x2_t  vec) { vst2q_u16(dst, vec); }
+static inline void vst2q(int32_t   *dst, int32x4x2_t   vec) { vst2q_s32(dst, vec); }
+static inline void vst2q(uint32_t  *dst, uint32x4x2_t  vec) { vst2q_u32(dst, vec); }
+static inline void vst2q(int64_t   *dst, int64x2x2_t   vec) { vst2q_s64(dst, vec); }
+static inline void vst2q(uint64_t  *dst, uint64x2x2_t  vec) { vst2q_u64(dst, vec); }
+static inline void vst2q(float32_t *dst, float32x4x2_t vec) { vst2q_f32(dst, vec); }
+
+static inline void vst3q(int8_t    *dst, int8x16x3_t   vec) { vst3q_s8(dst, vec); }
+static inline void vst3q(uint8_t   *dst, uint8x16x3_t  vec) { vst3q_u8(dst, vec); }
+static inline void vst3q(int16_t   *dst, int16x8x3_t   vec) { vst3q_s16(dst, vec); }
+static inline void vst3q(uint16_t  *dst, uint16x8x3_t  vec) { vst3q_u16(dst, vec); }
+static inline void vst3q(int32_t   *dst, int32x4x3_t   vec) { vst3q_s32(dst, vec); }
+static inline void vst3q(uint32_t  *dst, uint32x4x3_t  vec) { vst3q_u32(dst, vec); }
+static inline void vst3q(int64_t   *dst, int64x2x3_t   vec) { vst3q_s64(dst, vec); }
+static inline void vst3q(uint64_t  *dst, uint64x2x3_t  vec) { vst3q_u64(dst, vec); }
+static inline void vst3q(float32_t *dst, float32x4x3_t vec) { vst3q_f32(dst, vec); }
+
+static inline void vst4q(int8_t    *dst, int8x16x4_t   vec) { vst4q_s8(dst, vec); }
+static inline void vst4q(uint8_t   *dst, uint8x16x4_t  vec) { vst4q_u8(dst, vec); }
+static inline void vst4q(int16_t   *dst, int16x8x4_t   vec) { vst4q_s16(dst, vec); }
+static inline void vst4q(uint16_t  *dst, uint16x8x4_t  vec) { vst4q_u16(dst, vec); }
+static inline void vst4q(int32_t   *dst, int32x4x4_t   vec) { vst4q_s32(dst, vec); }
+static inline void vst4q(uint32_t  *dst, uint32x4x4_t  vec) { vst4q_u32(dst, vec); }
+static inline void vst4q(int64_t   *dst, int64x2x4_t   vec) { vst4q_s64(dst, vec); }
+static inline void vst4q(uint64_t  *dst, uint64x2x4_t  vec) { vst4q_u64(dst, vec); }
+static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); }
+
+static inline void vst1q_x2(int8_t    *dst, int8x16x2_t   vec) { vst1q_s8_x2(dst, vec); }
+static inline void vst1q_x2(uint8_t   *dst, uint8x16x2_t  vec) { vst1q_u8_x2(dst, vec); }
+static inline void vst1q_x2(int16_t   *dst, int16x8x2_t   vec) { vst1q_s16_x2(dst, vec); }
+static inline void vst1q_x2(uint16_t  *dst, uint16x8x2_t  vec) { vst1q_u16_x2(dst, vec); }
+static inline void vst1q_x2(int32_t   *dst, int32x4x2_t   vec) { vst1q_s32_x2(dst, vec); }
+static inline void vst1q_x2(uint32_t  *dst, uint32x4x2_t  vec) { vst1q_u32_x2(dst, vec); }
+static inline void vst1q_x2(int64_t   *dst, int64x2x2_t   vec) { vst1q_s64_x2(dst, vec); }
+static inline void vst1q_x2(uint64_t  *dst, uint64x2x2_t  vec) { vst1q_u64_x2(dst, vec); }
+static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { vst1q_f32_x2(dst, vec); }
+
+static inline void vst1q_x4(int8_t    *dst, int8x16x4_t   vec) { vst1q_s8_x4(dst, vec); }
+static inline void vst1q_x4(uint8_t   *dst, uint8x16x4_t  vec) { vst1q_u8_x4(dst, vec); }
+static inline void vst1q_x4(int16_t   *dst, int16x8x4_t   vec) { vst1q_s16_x4(dst, vec); }
+static inline void vst1q_x4(uint16_t  *dst, uint16x8x4_t  vec) { vst1q_u16_x4(dst, vec); }
+static inline void vst1q_x4(int32_t   *dst, int32x4x4_t   vec) { vst1q_s32_x4(dst, vec); }
+static inline void vst1q_x4(uint32_t  *dst, uint32x4x4_t  vec) { vst1q_u32_x4(dst, vec); }
+static inline void vst1q_x4(int64_t   *dst, int64x2x4_t   vec) { vst1q_s64_x4(dst, vec); }
+static inline void vst1q_x4(uint64_t  *dst, uint64x2x4_t  vec) { vst1q_u64_x4(dst, vec); }
+static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { vst1q_f32_x4(dst, vec); }
 
 // -----------------------------------------------------------------------------
 // vreinterpret*
diff --git a/kleidicv/src/arithmetics/exp_api.cpp b/kleidicv/src/arithmetics/exp_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39c492719c72a09e4b958b5007f59e945c0821e2
--- /dev/null
+++ b/kleidicv/src/arithmetics/exp_api.cpp
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/dispatch.h"
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/types.h"
+
+namespace kleidicv {
+
+namespace neon {
+
+template <typename T>
+kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride,
+                     size_t width, size_t height);
+
+}  // namespace neon
+
+}  // namespace kleidicv
+
+#define KLEIDICV_DEFINE_C_API(name, type)                                \
+  KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::exp<type>, nullptr, \
+                              nullptr)
+
+KLEIDICV_DEFINE_C_API(kleidicv_exp_f32, float);
diff --git a/kleidicv/src/arithmetics/exp_neon.cpp b/kleidicv/src/arithmetics/exp_neon.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5414247934e4de4318206d607aaef104c0048e74
--- /dev/null
+++ b/kleidicv/src/arithmetics/exp_neon.cpp
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cmath>
+
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/neon.h"
+
+namespace kleidicv::neon {
+
+template <typename ScalarType>
+class Exp;
+
+template <>
+class Exp<float> final : public UnrollOnce {
+ public:
+  using VecTraits = neon::VecTraits<float>;
+  using VectorType = typename VecTraits::VectorType;
+
+  VectorType vector_path(VectorType src) {
+    float32x4_t n, r, scale, poly, absn, z;
+    uint32x4_t cmp, e;
+
+    /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+    z = vfmaq_f32(vdupq_n(kShift), src, vdupq_n(kInvLn2));
+    n = z - vdupq_n(kShift);
+    r = vfmaq_f32(src, n, vdupq_n(-kLn2Hi));
+    r = vfmaq_f32(r, n, vdupq_n(-kLn2Lo));
+    e = vreinterpretq_u32_f32(z) << 23;
+    scale = vreinterpretq_f32_u32(e + vdupq_n(0x3f800000));
+    absn = vabsq_f32(n);
+    cmp = absn > vdupq_n(126.0F);
+    poly = vfmaq_f32(vdupq_n(kPoly[1]), vdupq_n(kPoly[0]), r);
+    poly = vfmaq_f32(vdupq_n(kPoly[2]), poly, r);
+    poly = vfmaq_f32(vdupq_n(kPoly[3]), poly, r);
+    poly = vfmaq_f32(vdupq_n(kPoly[4]), poly, r);
+    poly = vfmaq_f32(vdupq_n(1.0F), poly, r);
+    poly = vfmaq_f32(vdupq_n(1.0F), poly, r);
+    if (KLEIDICV_UNLIKELY(v_any_u32(cmp))) {
+      return specialcase(poly, n, e, absn);
+    }
+    return scale * poly;
+  }
+
+  float scalar_path(float src) { return expf(src); }
+
+ private:
+  static int v_any_u32(uint32x4_t x) {
+    /* assume elements in x are either 0 or -1u.  */
+    return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0;
+  }
+
+  static float32x4_t specialcase(float32x4_t poly, float32x4_t n, uint32x4_t e,
+                                 float32x4_t absn) {
+    /* 2^n may overflow, break it up into s1*s2.  */
+    uint32x4_t b = (n <= vdupq_n(0.0F)) & vdupq_n(0x83000000);
+    float32x4_t s1 = vreinterpretq_f32_u32(vdupq_n(0x7f000000) + b);
+    float32x4_t s2 = vreinterpretq_f32_u32(e - b);
+    uint32x4_t cmp = absn > vdupq_n(192.0F);
+    float32x4_t r1 = s1 * s1;
+    float32x4_t r0 = poly * s1 * s2;
+    return vreinterpretq_f32_u32((cmp & vreinterpretq_u32_f32(r1)) |
+                                 (~cmp & vreinterpretq_u32_f32(r0)));
+  }
+
+  static constexpr float kShift = 0x1.8p23F;
+  static constexpr float kInvLn2 = 0x1.715476p+0F;
+  static constexpr float kLn2Hi = 0x1.62e4p-1F;
+  static constexpr float kLn2Lo = 0x1.7f7d1cp-20F;
+  static constexpr float kPoly[] = {
+      /*  maxerr: 0.36565 +0.5 ulp.  */
+      0x1.6a6000p-10F, 0x1.12718ep-7F, 0x1.555af0p-5F,
+      0x1.555430p-3F,  0x1.fffff4p-2F,
+  };
+};  // end of class Exp<float>
+
+template <typename T>
+kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride,
+                     size_t width, size_t height) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride);
+  CHECK_IMAGE_SIZE(width, height);
+
+  Exp<T> operation;
+  Rectangle rect{width, height};
+  Rows<const T> src_rows{src, src_stride};
+  Rows<T> dst_rows{dst, dst_stride};
+  apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  return KLEIDICV_OK;
+}
+
+#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp<type>(         \
+      const type* src, size_t src_stride, type* dst, size_t dst_stride, \
+      size_t width, size_t height)
+
+KLEIDICV_INSTANTIATE_TEMPLATE(float);
+
+}  // namespace kleidicv::neon
diff --git a/scripts/ci-opencv.sh b/scripts/ci-opencv.sh
index a0982daffd444003b7924cef0c1009d1719f93f1..00ab6b07b78646ed827f49c1e0639bbdd868770a 100755
--- a/scripts/ci-opencv.sh
+++ b/scripts/ci-opencv.sh
@@ -54,6 +54,7 @@ CORE_TEST_PATTERNS=(
   '*Core_Transpose*'
   '*Core_MinMaxLoc*'
   '*Core_ConvertScale*'
+  '*Core_Exp*'
 )
 CORE_TEST_PATTERNS_STR="$(join_strings_with_colon "${CORE_TEST_PATTERNS[*]}")"
 ../../../conformity/opencv_kleidicv/bin/opencv_test_core \
diff --git a/test/api/test_exp.cpp b/test/api/test_exp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9347b21c10716c7e8729fbbab66ac6f92b42cb60
--- /dev/null
+++ b/test/api/test_exp.cpp
@@ -0,0 +1,239 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+
+#include "framework/generator.h"
+#include "framework/operation.h"
+#include "framework/utils.h"
+#include "kleidicv/kleidicv.h"
+
+#define KLEIDICV_EXP(type, suffix) \
+  KLEIDICV_API(exp, kleidicv_exp_##suffix, type)
+
+KLEIDICV_EXP(float, f32);
+
+static void check_1ulp_error(test::Array2D<float>& expected_array,
+                             test::Array2D<float>& actual_array) {
+  for (size_t i = 0; i < expected_array.height(); ++i) {
+    for (size_t j = 0; j < expected_array.width(); ++j) {
+      // Seems like clang-tidy does not understand what the fill member function
+      // of test::Array2D does, so these exceptions are required
+      // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign)
+      float expected = *(expected_array.at(i, j));
+      // NOLINTEND(clang-analyzer-core.uninitialized.Assign)
+
+      float actual = *(actual_array.at(i, j));
+      // Error of 1 ULP means that actual is either same as expected, or
+      // the next float value in negative of positive direction
+      EXPECT_EQ(std::nextafterf(actual, expected), expected);
+    }
+  }
+}
+
+template <typename ElementType>
+class ExpTestSpecial;
+
+template <>
+class ExpTestSpecial<float> final : public UnaryOperationTest<float> {
+  using ElementType = float;
+  using Elements = typename UnaryOperationTest<ElementType>::Elements;
+
+ public:
+  ExpTestSpecial() : test_elements_(input_values().size()) {
+    auto inputs = input_values();
+
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      test_elements_[i].values[0] = inputs[i];
+      // Expected values calculated as doubles to have 'perfect' references.
+      // As the NEON implementation reuses the toolchain's expf implementation
+      // for the tail path, the test expects that the error for expf is also
+      // less than 1 ULP.
+      test_elements_[i].values[1] =
+          static_cast<float>(exp(static_cast<double>(inputs[i])));
+    }
+  }
+
+ private:
+  static const std::vector<ElementType>& input_values() {
+    static const std::vector<ElementType> kInputValues = {
+        -105.31, -100.07, -81.012, -47.66, -3.1088, -0.21,
+        0.7,     6.2,     39.7201, 86.11,  88.947};
+
+    return kInputValues;
+  }
+
+  kleidicv_error_t call_api() override {
+    return exp<ElementType>()(
+        this->inputs_[0].data(), this->inputs_[0].stride(),
+        this->actual_[0].data(), this->actual_[0].stride(), this->width(),
+        this->height());
+  }
+
+  void check(kleidicv_error_t err) override {
+    EXPECT_EQ(KLEIDICV_OK, err);
+    check_1ulp_error(this->expected_[0], this->actual_[0]);
+  }
+
+  const std::vector<Elements>& test_elements() override {
+    return test_elements_;
+  }
+
+  void setup() override {
+    // Default input value is 0.0, so the default expected value needs to be set
+    // to 1.0
+    ElementType expected = 1.0;
+    this->expected_[0].fill(expected);
+    UnaryOperationTest<ElementType>::setup();
+  }
+
+  std::vector<Elements> test_elements_;
+};  // end of class ExpTestSpecial<float>
+
+template <typename ElementType>
+class ExpTestCustomBase;
+
+template <>
+class ExpTestCustomBase<float> {
+ protected:
+  static void fill_expected(test::Array2D<float>& input,
+                            test::Array2D<float>& expected) {
+    for (size_t i = 0; i < input.height(); ++i) {
+      for (size_t j = 0; j < input.width(); ++j) {
+        // Expected values calculated as doubles to have 'perfect' references.
+        // As the NEON implementation reuses the toolchain's expf implementation
+        // for the tail path, the test expects that the error for expf is also
+        // less than 1 ULP.
+
+        // Seems like clang-tidy does not understand what the fill member
+        // function of test::Array2D does, so these exceptions are required
+        // NOLINTBEGIN(clang-analyzer-core.CallAndMessage)
+        *(expected.at(i, j)) =
+            static_cast<float>(exp(static_cast<double>(*(input.at(i, j)))));
+        // NOLINTEND(clang-analyzer-core.CallAndMessage)
+      }
+    }
+  }
+};  // end of class ExpTestCustomBase<float>
+
+template <typename ElementType>
+class ExpTestRandom;
+
+template <>
+class ExpTestRandom<float> final : public ExpTestCustomBase<float> {
+ public:
+  void test() {
+    const size_t kWidth = test::Options::vector_length() * 16;
+    constexpr size_t kHeight = 16;
+    test::PseudoRandomNumberGenerator<float> generator;
+    test::Array2D<float> input{kWidth, kHeight};
+    test::Array2D<float> expected{kWidth, kHeight};
+    test::Array2D<float> actual{kWidth, kHeight};
+    input.fill(generator);
+
+    fill_expected(input, expected);
+
+    EXPECT_EQ(KLEIDICV_OK,
+              exp<float>()(input.data(), input.stride(), actual.data(),
+                           actual.stride(), input.width(), input.height()));
+
+    check_1ulp_error(expected, actual);
+  }
+};  // end of class ExpTestRandom<float>
+
+template <typename ElementType>
+class ExpTestAll;
+
+template <>
+class ExpTestAll<float> final : public ExpTestCustomBase<float> {
+ public:
+  void test() {
+    constexpr size_t kWidth = 1024;
+    constexpr size_t kHeight = 1024;
+    // Sweeping through a meaningful input range. The start value results in
+    // 0.0, the last value in inf.
+    LinearFloatGenerator generator{-104, 89};
+    test::Array2D<float> input{kWidth, kHeight};
+    test::Array2D<float> expected{kWidth, kHeight};
+    test::Array2D<float> actual{kWidth, kHeight};
+
+    while (generator.last_value_not_reached()) {
+      input.fill(generator);
+
+      fill_expected(input, expected);
+
+      EXPECT_EQ(KLEIDICV_OK,
+                exp<float>()(input.data(), input.stride(), actual.data(),
+                             actual.stride(), input.width(), input.height()));
+
+      check_1ulp_error(expected, actual);
+    }
+  }
+
+ private:
+  class LinearFloatGenerator : public test::Generator<float> {
+   public:
+    LinearFloatGenerator(float start_value, float last_value)
+        : start_value_{start_value},
+          last_value_{last_value},
+          value_{start_value} {}
+
+    void reset() override { value_ = start_value_; }
+
+    std::optional<float> next() override {
+      float current_value = value_;
+      value_ = std::nextafterf(value_, last_value_);
+      return current_value;
+    }
+
+    bool last_value_not_reached() const {
+      return std::nextafterf(value_, last_value_) != value_;
+    }
+
+   private:
+    float start_value_;
+    float last_value_;
+    float value_;
+  };  // end of class LinearFloatGenerator
+};    // end of class ExpTestAll<float>
+
+template <typename ElementType>
+class Exp : public testing::Test {};
+using ElementTypes = ::testing::Types<float>;
+TYPED_TEST_SUITE(Exp, ElementTypes);
+
+TYPED_TEST(Exp, SpecialValues) {
+  ExpTestSpecial<TypeParam>{}.test();
+  ExpTestSpecial<TypeParam>{}
+      .with_padding(test::Options::vector_length())
+      .test();
+}
+
+TYPED_TEST(Exp, RandomValues) { ExpTestRandom<TypeParam>{}.test(); }
+
+TYPED_TEST(Exp, AllValues) {
+  if (test::Options::are_long_running_tests_skipped()) {
+    GTEST_SKIP() << "Long running exp test skipped";
+  }
+  ExpTestAll<TypeParam>{}.test();
+}
+
+TYPED_TEST(Exp, OversizeImage) {
+  TypeParam src[1] = {}, dst[1];
+  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
+            exp<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
+                             KLEIDICV_MAX_IMAGE_PIXELS + 1, 1));
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      exp<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
+                       KLEIDICV_MAX_IMAGE_PIXELS, KLEIDICV_MAX_IMAGE_PIXELS));
+}
+
+TYPED_TEST(Exp, NullPointers) {
+  TypeParam src[1] = {}, dst[1];
+  test::test_null_args(exp<TypeParam>(), src, sizeof(TypeParam), dst,
+                       sizeof(TypeParam), 1, 1);
+}
diff --git a/test/framework/test_main.cpp b/test/framework/test_main.cpp
index a4d962e0204713273e927aa302115b08010114c7..4c71daa450f8719342deb2314bb388b98ce118e3 100644
--- a/test/framework/test_main.cpp
+++ b/test/framework/test_main.cpp
@@ -22,6 +22,7 @@ static void parse_arguments(int argc, char **argv) {
   static struct option long_options[] = {
     {"vector-length", required_argument, nullptr, 'v'},
     {"seed", required_argument, nullptr, 's'},
+    {"long-running-tests", no_argument, nullptr, 'l'},
     {nullptr, 0, nullptr, 0}
   };
   // clang-format on
@@ -48,6 +49,10 @@ static void parse_arguments(int argc, char **argv) {
         Options::set_seed(std::stoull(optarg));
         is_seed_set = true;
         break;
+
+      case 'l':
+        Options::turn_on_long_running_tests();
+        break;
     }
   }
 
diff --git a/test/framework/utils.cpp b/test/framework/utils.cpp
index a88f42a11ed13c39acc7670c56a47cdf88a2ed1e..bd090d73a423309564dd129c3e220c1ab22b204e 100644
--- a/test/framework/utils.cpp
+++ b/test/framework/utils.cpp
@@ -16,6 +16,8 @@
 
 bool MockMallocToFail::enabled = false;
 
+bool test::Options::are_long_running_tests_skipped_ = true;
+
 namespace test {
 
 template <typename ElementType>
diff --git a/test/framework/utils.h b/test/framework/utils.h
index 9ee2ea6210022b031f1ed55996d5537d5fe5526c..e5f0e46b06538be30653cf26c181e3babcb8393b 100644
--- a/test/framework/utils.h
+++ b/test/framework/utils.h
@@ -67,6 +67,11 @@ class Options {
   // Returns seed to use.
   static uint64_t seed() { return seed_; }
 
+  // Whether long running tests should be skipped.
+  static bool are_long_running_tests_skipped() {
+    return are_long_running_tests_skipped_;
+  }
+
   // Returns the number of lanes in a vector for a given arithmetic type.
   template <typename ElementType,
             std::enable_if_t<std::is_arithmetic_v<ElementType>, bool> = true>
@@ -89,11 +94,18 @@ class Options {
   // Sets the seed.
   static void set_seed(uint64_t value) { seed_ = value; }
 
+  // Turns on long running tests.
+  static void turn_on_long_running_tests() {
+    are_long_running_tests_skipped_ = false;
+  }
+
  private:
   // Vector length being tested.
   static size_t vector_length_;
   // Seed to use.
   static uint64_t seed_;
+  // Whether long running tests should be skipped.
+  static bool are_long_running_tests_skipped_;
 };  // end of class Options
 
 // Prints all the elements in a two-dimensional space.