From f6eed446f003f1e0b83b5f18161fc68f32301a32 Mon Sep 17 00:00:00 2001
From: Ioana Ghiban <ioana.ghiban@arm.com>
Date: Wed, 28 Aug 2024 14:27:12 +0200
Subject: [PATCH] Implement float Separable Filter 2D with NEON

---
 adapters/opencv/kleidicv_hal.cpp              |  14 +-
 benchmark/benchmark.cpp                       |  10 ++
 .../opencv/test_separable_filter_2d.cpp       |  36 ++++-
 conformity/opencv/utils.h                     |   2 +
 doc/opencv.md                                 |   2 +-
 .../kleidicv/filters/separable_filter_2d.h    |  18 +++
 kleidicv/include/kleidicv/kleidicv.h          |   6 +
 .../src/filters/separable_filter_2d_api.cpp   |  14 ++
 .../src/filters/separable_filter_2d_neon.cpp  | 140 ++++++++++++------
 .../include/kleidicv_thread/kleidicv_thread.h |   7 +
 kleidicv_thread/src/kleidicv_thread.cpp       |  17 +++
 test/api/test_float_conv.cpp                  |  17 +--
 test/api/test_in_range.cpp                    |  17 +--
 test/api/test_separable_filter_2d.cpp         | 140 +++++++++++++++++-
 test/api/test_thread.cpp                      |   5 +
 test/framework/array.h                        |  55 +++++++
 test/framework/border.cpp                     |   4 +
 test/framework/generator.h                    |  18 +++
 test/framework/kernel.h                       |  32 +++-
 test/framework/utils.cpp                      |   7 +
 test/framework/utils.h                        |   7 +
 21 files changed, 482 insertions(+), 86 deletions(-)
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 14afc0e3b..712371740 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -47,6 +47,7 @@ static size_t get_type_size(int depth) {
     case CV_16S:
       return 2;
     case CV_32S:
+    case CV_32F:
       return 4;
     default:
       return SIZE_MAX;
@@ -327,7 +328,8 @@ int separable_filter_2d_init(cvhalFilter2D **context, int src_type,
   }
 
   int operation_depth = CV_MAT_DEPTH(src_type);
-  if (operation_depth != CV_8U && operation_depth != CV_16U) {
+  if (operation_depth != CV_8U && operation_depth != CV_16U &&
+      operation_depth != CV_32F) {
     return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
 
@@ -459,6 +461,16 @@ int separable_filter_2d_operation(cvhalFilter2D *context, uchar *src_data,
           reinterpret_cast<const uint16_t *>(params->kernel_y),
           params->kernel_height, params->border_type, filter_context, mt);
       break;
+    case CV_32F:
+      filter_err = kleidicv_thread_separable_filter_2d_f32(
+          reinterpret_cast<const float *>(src_data), src_step,
+          reinterpret_cast<float *>(dst_data), dst_step,
+          static_cast<size_t>(width), static_cast<size_t>(height),
+          params->channels, reinterpret_cast<const float *>(params->kernel_x),
+          params->kernel_width,
+          reinterpret_cast<const float *>(params->kernel_y),
+          params->kernel_height, params->border_type, filter_context, mt);
+      break;
     default:
       return CV_HAL_ERROR_NOT_IMPLEMENTED;
   }
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 06839d728..37bfa1a29 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -290,6 +290,16 @@ static void separable_filter_2d_u16_5x5_3ch(benchmark::State& state) {
 }
 BENCHMARK(separable_filter_2d_u16_5x5_3ch);
 
+static void separable_filter_2d_f32_5x5_1ch(benchmark::State& state) {
+  separable_filter_2d<float, 5, 1>(state, kleidicv_separable_filter_2d_f32);
+}
+BENCHMARK(separable_filter_2d_f32_5x5_1ch);
+
+static void separable_filter_2d_f32_5x5_3ch(benchmark::State& state) {
+  separable_filter_2d<float, 5, 3>(state, kleidicv_separable_filter_2d_f32);
+}
+BENCHMARK(separable_filter_2d_f32_5x5_3ch);
+
 template <typename T, size_t KernelSize, int Channels, bool Binomial>
 static void gaussian_blur(benchmark::State& state) {
   kleidicv_filter_context_t* context;
diff --git a/conformity/opencv/test_separable_filter_2d.cpp b/conformity/opencv/test_separable_filter_2d.cpp
index 893493692..d5cdce213 100644
--- a/conformity/opencv/test_separable_filter_2d.cpp
+++ b/conformity/opencv/test_separable_filter_2d.cpp
@@ -92,17 +92,20 @@ bool test_separable_filter_2d(int index, RecreatedMessageQueue& request_queue,
       // One extra line allocated to be sure the kernel seed can be placed next
       // to the real input
       cv::Mat input(y + 1, x, get_opencv_matrix_type<TypeParam, Channels>());
-      // use the minimum value 1 for the input in order to properly work around
-      // the potential OpenCV bug (mentioned lower)
-      rng.fill(input, cv::RNG::UNIFORM, 1,
-               std::numeric_limits<TypeParam>::max());
 
-      uint32_t kernel_seed = rng.next();
+      if constexpr (std::is_same_v<TypeParam, float>) {
+        rng.fill(input, cv::RNG::NORMAL, 0.0, 1.0e10);
+      } else {
+        // use the minimum value 1 for the input in order to properly work
+        // around the potential OpenCV bug (mentioned lower)
+        rng.fill(input, cv::RNG::UNIFORM, 1,
+                 std::numeric_limits<TypeParam>::max());
+      }
 
       // kernel seed is embedded into the input matrix
+      uint32_t kernel_seed = rng.next();
       *reinterpret_cast<uint32_t*>(&input.at<TypeParam>(input.rows - 1, 0)) =
           kernel_seed;
-
       cv::Mat actual =
           exec_separable_filter_2d<TypeParam, KernelSize, BorderType>(input);
       cv::Mat expected = get_expected_from_subordinate(index, request_queue,
@@ -118,7 +121,11 @@ bool test_separable_filter_2d(int index, RecreatedMessageQueue& request_queue,
         }
       }
 
-      if (are_matrices_different<TypeParam>(0, actual, expected)) {
+      bool success =
+          (std::is_same_v<TypeParam, float> &&
+           !are_float_matrices_different<float>(0.3, actual, expected)) ||
+          (!are_matrices_different<TypeParam>(0, actual, expected));
+      if (!success) {
         fail_print_matrices(y, x, input, actual, expected);
         return true;
       }
@@ -166,6 +173,21 @@ std::vector<test>& separable_filter_2d_tests_get() {
     TEST("Separable Filter 2D 5x5 (u16), BORDER_REPLICATE, 2 channel", (test_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE, 2>), (exec_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE>)),
     TEST("Separable Filter 2D 5x5 (u16), BORDER_REPLICATE, 3 channel", (test_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE, 3>), (exec_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE>)),
     TEST("Separable Filter 2D 5x5 (u16), BORDER_REPLICATE, 4 channel", (test_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE, 4>), (exec_separable_filter_2d<uint16_t, 5, cv::BORDER_REPLICATE>)),
+
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT_101, 1 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101, 1>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT_101, 2 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101, 2>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT_101, 3 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101, 3>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT_101, 4 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101, 4>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT_101>)),
+
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT, 1 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT, 1>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT, 2 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT, 2>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT, 3 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT, 3>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REFLECT, 4 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REFLECT, 4>), (exec_separable_filter_2d<float, 5, cv::BORDER_REFLECT>)),
+
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REPLICATE, 1 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REPLICATE, 1>), (exec_separable_filter_2d<float, 5, cv::BORDER_REPLICATE>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REPLICATE, 2 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REPLICATE, 2>), (exec_separable_filter_2d<float, 5, cv::BORDER_REPLICATE>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REPLICATE, 3 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REPLICATE, 3>), (exec_separable_filter_2d<float, 5, cv::BORDER_REPLICATE>)),
+    TEST("Separable Filter 2D 5x5 (f32), BORDER_REPLICATE, 4 channel", (test_separable_filter_2d<float, 5, cv::BORDER_REPLICATE, 4>), (exec_separable_filter_2d<float, 5, cv::BORDER_REPLICATE>)),
   };
   // clang-format on
   return tests;
diff --git a/conformity/opencv/utils.h b/conformity/opencv/utils.h
index 321a7e590..89e884764 100644
--- a/conformity/opencv/utils.h
+++ b/conformity/opencv/utils.h
@@ -17,6 +17,8 @@ constexpr int get_opencv_matrix_type() {
     return CV_8UC(Channels);
   } else if constexpr (std::is_same_v<T, uint16_t>) {
     return CV_16UC(Channels);
+  } else if constexpr (std::is_same_v<T, float>) {
+    return CV_32FC(Channels);
   }
 }
 
diff --git a/doc/opencv.md b/doc/opencv.md
index 431a8e21a..7372f30f4 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -99,7 +99,7 @@ Notes on parameters:
 
 ### [`cv::sepFilter2D()`](https://docs.opencv.org/4.10.0/d4/d86/group__imgproc__filter.html#ga910e29ff7d7b105057d1625a4bf6318d)
 Applies a separable linear filter to an image.\
-Currently only the 5x5 kernel size is supported with `CV_8U` and `CV_16U` source, destination and kernel depths.\
+Currently only the 5x5 kernel size is supported with `CV_8U`, `CV_16U` and `CV_32F` source, destination and kernel depths.\
 In-place filtering not supported.
 
 Notes on parameters:
diff --git a/kleidicv/include/kleidicv/filters/separable_filter_2d.h b/kleidicv/include/kleidicv/filters/separable_filter_2d.h
index cad5539f9..b633c4382 100644
--- a/kleidicv/include/kleidicv/filters/separable_filter_2d.h
+++ b/kleidicv/include/kleidicv/filters/separable_filter_2d.h
@@ -32,6 +32,17 @@ KLEIDICV_API_DECLARATION(kleidicv_separable_filter_2d_stripe_u16,
                          const uint16_t *kernel_y, size_t kernel_height,
                          kleidicv_border_type_t border_type,
                          kleidicv_filter_context_t *context);
+// For internal use only. See instead kleidicv_separable_filter_2d_f32.
+// Filter a horizontal stripe across an image. The stripe is defined by the
+// range (y_begin, y_end].
+KLEIDICV_API_DECLARATION(kleidicv_separable_filter_2d_stripe_f32,
+                         const float *src, size_t src_stride, float *dst,
+                         size_t dst_stride, size_t width, size_t height,
+                         size_t y_begin, size_t y_end, size_t channels,
+                         const float *kernel_x, size_t kernel_width,
+                         const float *kernel_y, size_t kernel_height,
+                         kleidicv_border_type_t border_type,
+                         kleidicv_filter_context_t *context);
 }
 
 namespace kleidicv {
@@ -52,6 +63,13 @@ kleidicv_error_t separable_filter_2d_stripe_u16(
     size_t kernel_height, kleidicv_border_type_t border_type,
     kleidicv_filter_context_t *context);
 
+kleidicv_error_t separable_filter_2d_stripe_f32(
+    const float *src, size_t src_stride, float *dst, size_t dst_stride,
+    size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
+    const float *kernel_x, size_t kernel_width, const float *kernel_y,
+    size_t kernel_height, kleidicv_border_type_t border_type,
+    kleidicv_filter_context_t *context);
+
 }  // namespace neon
 
 namespace sve2 {
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index e18175929..ba64a50ee 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1277,6 +1277,12 @@ kleidicv_error_t kleidicv_separable_filter_2d_u16(
     size_t width, size_t height, size_t channels, const uint16_t *kernel_x,
     size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height,
     kleidicv_border_type_t border_type, kleidicv_filter_context_t *context);
+/// @copydoc kleidicv_separable_filter_2d_u8
+kleidicv_error_t kleidicv_separable_filter_2d_f32(
+    const float *src, size_t src_stride, float *dst, size_t dst_stride,
+    size_t width, size_t height, size_t channels, const float *kernel_x,
+    size_t kernel_width, const float *kernel_y, size_t kernel_height,
+    kleidicv_border_type_t border_type, kleidicv_filter_context_t *context);
 
 /// Applies Gaussian blur to the source image using the specified parameters.
 /// In-place filtering is not supported.
diff --git a/kleidicv/src/filters/separable_filter_2d_api.cpp b/kleidicv/src/filters/separable_filter_2d_api.cpp
index e4adf6cea..371c41147 100644
--- a/kleidicv/src/filters/separable_filter_2d_api.cpp
+++ b/kleidicv/src/filters/separable_filter_2d_api.cpp
@@ -19,6 +19,10 @@ KLEIDICV_MULTIVERSION_C_API(
     KLEIDICV_SVE2_IMPL_IF(kleidicv::sve2::separable_filter_2d_stripe_u16),
     &kleidicv::sme2::separable_filter_2d_stripe_u16);
 
+KLEIDICV_MULTIVERSION_C_API(kleidicv_separable_filter_2d_stripe_f32,
+                            &kleidicv::neon::separable_filter_2d_stripe_f32,
+                            nullptr, nullptr);
+
 extern "C" {
 
 using KLEIDICV_TARGET_NAMESPACE::Rectangle;
@@ -88,4 +92,14 @@ kleidicv_error_t kleidicv_separable_filter_2d_u16(
       kernel_x, kernel_width, kernel_y, kernel_height, border_type, context);
 }
 
+kleidicv_error_t kleidicv_separable_filter_2d_f32(
+    const float *src, size_t src_stride, float *dst, size_t dst_stride,
+    size_t width, size_t height, size_t channels, const float *kernel_x,
+    size_t kernel_width, const float *kernel_y, size_t kernel_height,
+    kleidicv_border_type_t border_type, kleidicv_filter_context_t *context) {
+  return kleidicv_separable_filter_2d_stripe_f32(
+      src, src_stride, dst, dst_stride, width, height, 0, height, channels,
+      kernel_x, kernel_width, kernel_y, kernel_height, border_type, context);
+}
+
 }  // extern "C"
diff --git a/kleidicv/src/filters/separable_filter_2d_neon.cpp b/kleidicv/src/filters/separable_filter_2d_neon.cpp
index da1009549..aadca93e2 100644
--- a/kleidicv/src/filters/separable_filter_2d_neon.cpp
+++ b/kleidicv/src/filters/separable_filter_2d_neon.cpp
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#include <cmath>
 #include <limits>
 
 #include "kleidicv/ctypes.h"
@@ -219,6 +220,79 @@ class SeparableFilter2D<uint16_t, 5> {
   SourceVectorType kernel_y_u16_[5];
 };  // end of class SeparableFilter2D<uint16_t, 5>
 
+template <>
+class SeparableFilter2D<float, 5> {
+ public:
+  using SourceType = float;
+  using SourceVectorType = typename VecTraits<SourceType>::VectorType;
+  using BufferType = float;
+  using BufferVectorType = typename VecTraits<BufferType>::VectorType;
+  using DestinationType = float;
+
+  // Ignored because vectors are initialized in the constructor body.
+  // NOLINTNEXTLINE - hicpp-member-init
+  SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
+      : kernel_x_(kernel_x), kernel_y_(kernel_y) {
+    for (size_t i = 0; i < 5; i++) {
+      kernel_x_f32_[i] = vdupq_n_f32(kernel_x[i]);
+      kernel_y_f32_[i] = vdupq_n_f32(kernel_y[i]);
+    }
+  }
+
+  void vertical_vector_path(SourceVectorType src[5],
+                            DestinationType *dst) const {
+    auto acc = vmulq_f32(src[0], kernel_y_f32_[0]);
+
+    // Optimization to avoid unnecessary branching in vector code.
+    KLEIDICV_FORCE_LOOP_UNROLL
+    for (size_t i = 1; i < 5; i++) {
+      acc = vmlaq_f32(acc, src[i], kernel_y_f32_[i]);
+    }
+
+    vst1q_f32(&dst[0], acc);
+  }
+
+  void vertical_scalar_path(const SourceType src[5],
+                            DestinationType *dst) const {
+    SourceType acc = src[0] * kernel_y_[0];
+    for (size_t i = 1; i < 5; i++) {
+      acc += src[i] * kernel_y_[i];
+    }
+
+    dst[0] = acc;
+  }
+
+  void horizontal_vector_path(SourceVectorType src[5],
+                              DestinationType *dst) const {
+    SourceVectorType acc = vmulq_f32(src[0], kernel_x_f32_[0]);
+
+    // Optimization to avoid unnecessary branching in vector code.
+    KLEIDICV_FORCE_LOOP_UNROLL
+    for (size_t i = 1; i < 5; i++) {
+      acc = vmlaq_f32(acc, src[i], kernel_x_f32_[i]);
+    }
+    vst1q_f32(&dst[0], acc);
+  }
+
+  void horizontal_scalar_path(const SourceType src[5],
+                              DestinationType *dst) const {
+    SourceType acc = src[0] * kernel_x_[0];
+
+    for (size_t i = 1; i < 5; i++) {
+      acc += src[i] * kernel_x_[i];
+    }
+
+    dst[0] = acc;
+  }
+
+ private:
+  const SourceType *kernel_x_;
+  const SourceType *kernel_y_;
+
+  SourceVectorType kernel_x_f32_[5];
+  SourceVectorType kernel_y_f32_[5];
+};  // end of class SeparableFilter2D<float, 5>
+
 template <typename T>
 static kleidicv_error_t separable_filter_2d_checks(
     const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
@@ -255,11 +329,11 @@ static kleidicv_error_t separable_filter_2d_checks(
   return KLEIDICV_OK;
 }
 
-KLEIDICV_TARGET_FN_ATTRS
-kleidicv_error_t separable_filter_2d_stripe_u8(
-    const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
-    size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
-    const uint8_t *kernel_x, size_t kernel_width, const uint8_t *kernel_y,
+template <typename T>
+kleidicv_error_t separable_filter_2d_stripe_entry(
+    const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
+    size_t height, size_t y_begin, size_t y_end, size_t channels,
+    const T *kernel_x, size_t kernel_width, const T *kernel_y,
     size_t kernel_height, kleidicv_border_type_t border_type,
     kleidicv_filter_context_t *context) {
   auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
@@ -280,55 +354,35 @@ kleidicv_error_t separable_filter_2d_stripe_u8(
 
   Rectangle rect{width, height};
 
-  using SeparableFilterClass = SeparableFilter2D<uint8_t, 5>;
+  using SeparableFilterClass = SeparableFilter2D<T, 5>;
 
   SeparableFilterClass filterClass{kernel_x, kernel_y};
   SeparableFilter<SeparableFilterClass, 5> filter{filterClass};
 
-  Rows<const uint8_t> src_rows{src, src_stride, channels};
-  Rows<uint8_t> dst_rows{dst, dst_stride, channels};
+  Rows<const T> src_rows{src, src_stride, channels};
+  Rows<T> dst_rows{dst, dst_stride, channels};
   workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
                      *fixed_border_type, filter);
 
   return KLEIDICV_OK;
 }
 
-KLEIDICV_TARGET_FN_ATTRS
-kleidicv_error_t separable_filter_2d_stripe_u16(
-    const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
-    size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
-    const uint16_t *kernel_x, size_t kernel_width, const uint16_t *kernel_y,
-    size_t kernel_height, kleidicv_border_type_t border_type,
-    kleidicv_filter_context_t *context) {
-  auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
-  kleidicv_error_t checks_result = separable_filter_2d_checks(
-      src, src_stride, dst, dst_stride, width, height, channels, kernel_x,
-      kernel_width, kernel_y, kernel_height, workspace);
-
-  if (checks_result != KLEIDICV_OK) {
-    return checks_result;
-  }
-
-  auto fixed_border_type = get_fixed_border_type(border_type);
-  // if the std::optional is empty, that means that the border type is not
-  // supported, so there's no need to check for specific types
-  if (!fixed_border_type) {
-    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+#define KLEIDICV_INSTANTIATE_WRAPPER(type, type_suffix)                  \
+  kleidicv_error_t separable_filter_2d_stripe_##type_suffix(             \
+      const type *src, size_t src_stride, type *dst, size_t dst_stride,  \
+      size_t width, size_t height, size_t y_begin, size_t y_end,         \
+      size_t channels, const type *kernel_x, size_t kernel_width,        \
+      const type *kernel_y, size_t kernel_height,                        \
+      kleidicv_border_type_t border_type,                                \
+      kleidicv_filter_context_t *context) {                              \
+    return separable_filter_2d_stripe_entry(                             \
+        src, src_stride, dst, dst_stride, width, height, y_begin, y_end, \
+        channels, kernel_x, kernel_width, kernel_y, kernel_height,       \
+        border_type, context);                                           \
   }
 
-  Rectangle rect{width, height};
-
-  using SeparableFilterClass = SeparableFilter2D<uint16_t, 5>;
-
-  SeparableFilterClass filterClass{kernel_x, kernel_y};
-  SeparableFilter<SeparableFilterClass, 5> filter{filterClass};
-
-  Rows<const uint16_t> src_rows{src, src_stride, channels};
-  Rows<uint16_t> dst_rows{dst, dst_stride, channels};
-  workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
-                     *fixed_border_type, filter);
-
-  return KLEIDICV_OK;
-}
+KLEIDICV_INSTANTIATE_WRAPPER(uint8_t, u8);
+KLEIDICV_INSTANTIATE_WRAPPER(uint16_t, u16);
+KLEIDICV_INSTANTIATE_WRAPPER(float, f32);
 
 }  // namespace kleidicv::neon
diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
index 0267fa272..f4d2f0bcb 100644
--- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
+++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h
@@ -259,6 +259,13 @@ kleidicv_error_t kleidicv_thread_separable_filter_2d_u16(
     kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
     kleidicv_thread_multithreading);
 
+kleidicv_error_t kleidicv_thread_separable_filter_2d_f32(
+    const float *src, size_t src_stride, float *dst, size_t dst_stride,
+    size_t width, size_t height, size_t channels, const float *kernel_x,
+    size_t kernel_width, const float *kernel_y, size_t kernel_height,
+    kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
+    kleidicv_thread_multithreading);
+
 kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8(
     const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
     size_t width, size_t height, size_t channels,
diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp
index e01c06028..021f6cec9 100644
--- a/kleidicv_thread/src/kleidicv_thread.cpp
+++ b/kleidicv_thread/src/kleidicv_thread.cpp
@@ -460,6 +460,23 @@ kleidicv_error_t kleidicv_thread_separable_filter_2d_u16(
                                 kernel_height, context, mt);
 }
 
+kleidicv_error_t kleidicv_thread_separable_filter_2d_f32(
+    const float *src, size_t src_stride, float *dst, size_t dst_stride,
+    size_t width, size_t height, size_t channels, const float *kernel_x,
+    size_t kernel_width, const float *kernel_y, size_t kernel_height,
+    kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
+    kleidicv_thread_multithreading mt) {
+  auto callback = [=](size_t y_begin, size_t y_end,
+                      kleidicv_filter_context_t *thread_context) {
+    return kleidicv_separable_filter_2d_stripe_f32(
+        src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
+        channels, kernel_x, kernel_width, kernel_y, kernel_height, border_type,
+        thread_context);
+  };
+  return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
+                                kernel_height, context, mt);
+}
+
 kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8(
     const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
     size_t width, size_t height, size_t channels,
diff --git a/test/api/test_float_conv.cpp b/test/api/test_float_conv.cpp
index 5f3b0dda3..a3a861843 100644
--- a/test/api/test_float_conv.cpp
+++ b/test/api/test_float_conv.cpp
@@ -55,32 +55,25 @@ class FloatConversionTest final {
     OutputType expected;
   };
 
-  static float floatval(uint32_t v) {
-    float result;  // Avoid cppcoreguidelines-init-variables. NOLINT
-    static_assert(sizeof(result) == sizeof(v));
-    memcpy(&result, &v, sizeof(result));
-    return result;
-  }
-
   const float quietNaN = std::numeric_limits<float>::quiet_NaN();
   const float signalingNaN = std::numeric_limits<float>::signaling_NaN();
   const float posInfinity = std::numeric_limits<float>::infinity();
   const float negInfinity = -std::numeric_limits<float>::infinity();
 
-  const float minusNaN = floatval(0xFF800001);
-  const float plusNaN = floatval(0x7F800001);
+  const float minusNaN = test::floatval(0xFF800001);
+  const float plusNaN = test::floatval(0x7F800001);
   const float plusZero = 0.0F;
   const float minusZero = -0.0F;
 
-  const float oneNaN = floatval(0x7FC00001);
+  const float oneNaN = test::floatval(0x7FC00001);
   const float zeroDivZero = -std::numeric_limits<float>::quiet_NaN();
   const float floatMin = std::numeric_limits<float>::min();
   const float floatMax = std::numeric_limits<float>::max();
 
   const float posSubnormalMin = std::numeric_limits<float>::denorm_min();
-  const float posSubnormalMax = floatval(0x007FFFFF);
+  const float posSubnormalMax = test::floatval(0x007FFFFF);
   const float negSubnormalMin = -std::numeric_limits<float>::denorm_min();
-  const float negSubnormalMax = floatval(0x807FFFFF);
+  const float negSubnormalMax = test::floatval(0x807FFFFF);
 
   template <typename I, typename O,
             std::enable_if_t<std::is_same_v<float, I>, bool> = true,
diff --git a/test/api/test_in_range.cpp b/test/api/test_in_range.cpp
index b22ee5cd9..4e40daa43 100644
--- a/test/api/test_in_range.cpp
+++ b/test/api/test_in_range.cpp
@@ -48,32 +48,25 @@ class InRangeTest final {
           expected_rows(std::move(_expected_rows)) {}
   };
 
-  static float floatval(uint32_t v) {
-    float result;  // Avoid cppcoreguidelines-init-variables. NOLINT
-    static_assert(sizeof(result) == sizeof(v));
-    memcpy(&result, &v, sizeof(result));
-    return result;
-  }
-
   const float quietNaN = std::numeric_limits<float>::quiet_NaN();
   const float signalingNaN = std::numeric_limits<float>::signaling_NaN();
   const float posInfinity = std::numeric_limits<float>::infinity();
   const float negInfinity = -std::numeric_limits<float>::infinity();
 
-  const float minusNaN = floatval(0xFF800001);
-  const float plusNaN = floatval(0x7F800001);
+  const float minusNaN = test::floatval(0xFF800001);
+  const float plusNaN = test::floatval(0x7F800001);
   const float plusZero = 0.0F;
   const float minusZero = -0.0F;
 
-  const float oneNaN = floatval(0x7FC00001);
+  const float oneNaN = test::floatval(0x7FC00001);
   const float zeroDivZero = -std::numeric_limits<float>::quiet_NaN();
   const float floatMin = std::numeric_limits<float>::min();
   const float floatMax = std::numeric_limits<float>::max();
 
   const float posSubnormalMin = std::numeric_limits<float>::denorm_min();
-  const float posSubnormalMax = floatval(0x007FFFFF);
+  const float posSubnormalMax = test::floatval(0x007FFFFF);
   const float negSubnormalMin = -std::numeric_limits<float>::denorm_min();
-  const float negSubnormalMax = floatval(0x807FFFFF);
+  const float negSubnormalMax = test::floatval(0x807FFFFF);
 
   void calculate_expected(const test::Array2D<ElementType>& source,
                           test::Array2D<uint8_t>& expected,
diff --git a/test/api/test_separable_filter_2d.cpp b/test/api/test_separable_filter_2d.cpp
index 16ad58f91..7b92916e8 100644
--- a/test/api/test_separable_filter_2d.cpp
+++ b/test/api/test_separable_filter_2d.cpp
@@ -13,6 +13,7 @@
 
 KLEIDICV_API(separable_filter_2d, kleidicv_separable_filter_2d_u8, uint8_t)
 KLEIDICV_API(separable_filter_2d, kleidicv_separable_filter_2d_u16, uint16_t)
+KLEIDICV_API(separable_filter_2d, kleidicv_separable_filter_2d_f32, float)
 
 // Implements KernelTestParams for SeparableFilter2D operators.
 template <typename ElementType, size_t KernelSize>
@@ -36,6 +37,15 @@ struct SeparableFilter2DKernelTestParams<uint16_t, KernelSize> {
   static constexpr size_t kKernelSize = KernelSize;
 };  // end of struct SeparableFilter2DKernelTestParams<uint16_t, KernelSize>
 
+template <size_t KernelSize>
+struct SeparableFilter2DKernelTestParams<float, KernelSize> {
+  using InputType = float;
+  using IntermediateType = float;
+  using OutputType = float;
+
+  static constexpr size_t kKernelSize = KernelSize;
+};  // end of struct SeparableFilter2DKernelTestParams<float, KernelSize>
+
 static constexpr std::array<kleidicv_border_type_t, 1> kDefaultBorder = {
     KLEIDICV_BORDER_TYPE_REPLICATE};
 
@@ -92,10 +102,17 @@ class SeparableFilter2DTest : public test::KernelTest<KernelTestParams> {
     auto kSupportedBorderValues = test::default_border_values();
     // Create generators and execute test.
     test::SequenceGenerator tested_border_values{kSupportedBorderValues};
-    test::PseudoRandomNumberGeneratorIntRange<InputType> element_generator{
-        0, max_value};
-    Base::test(kernel, *array_layout_generator_, *border_type_generator_,
-               tested_border_values, element_generator);
+    if constexpr (std::is_integral_v<InputType>) {
+      test::PseudoRandomNumberGeneratorIntRange<InputType> element_generator{
+          0, max_value};
+      Base::test(kernel, *array_layout_generator_, *border_type_generator_,
+                 tested_border_values, element_generator);
+    } else {
+      test::PseudoRandomNumberGeneratorFloatRange<InputType> element_generator{
+          0, max_value};
+      Base::test(kernel, *array_layout_generator_, *border_type_generator_,
+                 tested_border_values, element_generator);
+    }
   }
 
  protected:
@@ -133,7 +150,7 @@ class SeparableFilter2DTest : public test::KernelTest<KernelTestParams> {
   }
 };  // end of class SeparableFilter2DTest<KernelTestParams>
 
-using ElementTypes = ::testing::Types<uint8_t, uint16_t>;
+using ElementTypes = ::testing::Types<uint8_t, uint16_t, float>;
 
 template <typename ElementType>
 class SeparableFilter2D : public testing::Test {};
@@ -502,6 +519,119 @@ TEST(SeparableFilter2D, 5x5_U16OverflowVector) {
   EXPECT_EQ_ARRAY2D(dst_expected, dst);
 }
 
+TEST(SeparableFilter2D, 5x5_F32MaxInf) {
+  using TypeParam = float;
+
+  kleidicv_filter_context_t *context = nullptr;
+  ASSERT_EQ(KLEIDICV_OK,
+            kleidicv_filter_context_create(&context, 1, 5, 5, 5, 7));
+
+  const float posInfinity = std::numeric_limits<float>::infinity();
+  const float floatMax = std::numeric_limits<float>::max();
+
+  test::Array2D<TypeParam> src{5, 7, test::Options::vector_length()};
+  test::Array2D<TypeParam> kernel_x{5, 1};
+  test::Array2D<TypeParam> kernel_y{5, 1};
+  test::Array2D<TypeParam> dst{5, 7, test::Options::vector_length()};
+  test::Array2D<float> dst_expected{5, 7, test::Options::vector_length()};
+
+  // clang-format off
+  src.set(0, 0, { 1, 1, 1, 1, 1});
+  src.set(1, 0, { 1, 1, 1, 1, 1});
+  src.set(2, 0, { 1, 1, 1, 1, 1});
+  src.set(3, 0, { 1, 1, 1, 1, 1});
+  src.set(4, 0, { 1, 1, 1, 1, 1});
+  src.set(5, 0, { 1, 1, 1, 1, 1});
+  src.set(6, 0, { 1, 1, 1, 1, 1});
+  // clang-format on
+
+  kernel_x.set(0, 0, {floatMax, 1, 1, 1, 1});
+  kernel_y.set(0, 0, {1, 1, 1, 1, 1});
+
+  EXPECT_EQ(KLEIDICV_OK, separable_filter_2d<TypeParam>()(
+                             src.data(), src.stride(), dst.data(), dst.stride(),
+                             5, 7, 1, kernel_x.data(), 5, kernel_y.data(), 5,
+                             KLEIDICV_BORDER_TYPE_REPLICATE, context));
+
+  // clang-format off
+  dst_expected.set(0, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(1, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(2, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(3, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(4, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(5, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  dst_expected.set(6, 0, { posInfinity, posInfinity, posInfinity, posInfinity, posInfinity});
+  // clang-format on
+
+  EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context));
+  EXPECT_EQ_ARRAY2D(dst_expected, dst);
+}
+
+TEST(SeparableFilter2D, 5x5_F32SpecialValues) {
+  using TypeParam = float;
+
+  kleidicv_filter_context_t *context = nullptr;
+  ASSERT_EQ(KLEIDICV_OK,
+            kleidicv_filter_context_create(&context, 1, 5, 5, 5, 7));
+  test::Array2D<TypeParam> src{5, 7, test::Options::vector_length()};
+  test::Array2D<TypeParam> kernel_x{5, 1};
+  test::Array2D<TypeParam> kernel_y{5, 1};
+  test::Array2D<TypeParam> dst{5, 7, test::Options::vector_length()};
+  test::Array2D<TypeParam> dst_expected{5, 7, test::Options::vector_length()};
+
+  const float quietMinusNaN = test::floatval(0xFFC00001);
+  const float quietPlusNaN = std::numeric_limits<float>::quiet_NaN();
+  const float signalingNaN = std::numeric_limits<float>::signaling_NaN();
+  const float negInfinity = -std::numeric_limits<float>::infinity();
+  const float posInfinity = std::numeric_limits<float>::infinity();
+
+  const float minusNaN = test::floatval(0xFF800001);
+  const float plusNaN = test::floatval(0x7F800001);
+  const float minusZero = -0.0F;
+  const float plusZero = 0.0F;
+
+  const float oneNaN = test::floatval(0x7FC00001);
+  const float zeroDivZero = -std::numeric_limits<float>::quiet_NaN();
+  const float floatMin = std::numeric_limits<float>::min();
+  const float floatMax = std::numeric_limits<float>::max();
+
+  const float posSubnormalMin = std::numeric_limits<float>::denorm_min();
+  const float posSubnormalMax = test::floatval(0x007FFFFF);
+  const float negSubnormalMin = -std::numeric_limits<float>::denorm_min();
+  const float negSubnormalMax = test::floatval(0x807FFFFF);
+
+  // clang-format off
+  src.set(0, 0, { quietPlusNaN, signalingNaN, negInfinity, posInfinity, zeroDivZero });
+  src.set(1, 0, { minusNaN, plusNaN, oneNaN, minusZero, plusZero });
+  src.set(2, 0, { 1111.11, -1112.22, 113.33, floatMin, floatMax });
+  src.set(3, 0, { 114.44, posSubnormalMin, posSubnormalMax, negSubnormalMin, negSubnormalMax });
+  src.set(4, 0, { 111.51, 112.62, 113.73, 114.84, 114.83 });
+  src.set(5, 0, { 126.66, 127.11, 128.66, 129.11, 129.1 });
+  src.set(6, 0, { 11.5, 12.5, -11.5, -12.5, -12.51 });
+  // clang-format on
+
+  kernel_x.set(0, 0, {38, 0, 38, 0, 38});
+  kernel_y.set(0, 0, {38, 0, 38, 0, 38});
+
+  EXPECT_EQ(KLEIDICV_OK, separable_filter_2d<TypeParam>()(
+                             src.data(), src.stride(), dst.data(), dst.stride(),
+                             5, 7, 1, kernel_x.data(), 5, kernel_y.data(), 5,
+                             KLEIDICV_BORDER_TYPE_REPLICATE, context));
+
+  // clang-format off
+  dst_expected.set(0, 0, {quietMinusNaN, quietMinusNaN, quietMinusNaN, oneNaN, quietPlusNaN});
+  dst_expected.set(1, 0, {quietMinusNaN, quietMinusNaN, quietMinusNaN, oneNaN, quietPlusNaN});
+  dst_expected.set(2, 0, {quietMinusNaN, quietMinusNaN, quietMinusNaN, oneNaN, oneNaN});
+  dst_expected.set(3, 0, {quietMinusNaN, quietMinusNaN, quietMinusNaN, oneNaN, oneNaN});
+  dst_expected.set(4, 0, {3875407.25, 504475.8125, posInfinity, quietPlusNaN, quietPlusNaN});
+  dst_expected.set(5, 0, {898687.9375, 734736.125, 702289.5, 538337.625, 505890.96875});
+  dst_expected.set(6, 0, {519479, 522684.6875, 454932.1875, 458137.875, 390385.375});
+  // clang-format on
+
+  EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context));
+  EXPECT_APPROX_EQ_ARRAY2D(1e-4, dst_expected, dst);
+}
+
 TYPED_TEST(SeparableFilter2D, NullPointer) {
   using KernelTestParams = SeparableFilter2DKernelTestParams<TypeParam, 5>;
   kleidicv_filter_context_t *context = nullptr;
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 1abc6e92e..c70726294 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -205,6 +205,11 @@ TEST_P(Thread, separable_filter_2d_u16) {
                                       kleidicv_thread_separable_filter_2d_u16);
 }
 
+TEST_P(Thread, separable_filter_2d_f32) {
+  check_separable_filter_2d<float>(kleidicv_separable_filter_2d_f32,
+                                   kleidicv_thread_separable_filter_2d_f32);
+}
+
 TEST_P(Thread, SobelHorizontal1Channel) {
   check_unary_op<uint8_t, int16_t>(kleidicv_sobel_3x3_horizontal_s16_u8,
                                    kleidicv_thread_sobel_3x3_horizontal_s16_u8,
diff --git a/test/framework/array.h b/test/framework/array.h
index 8de46cf34..50bc15783 100644
--- a/test/framework/array.h
+++ b/test/framework/array.h
@@ -7,6 +7,7 @@
 
 #include <gtest/gtest.h>
 
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
@@ -214,6 +215,41 @@ class Array2D : public TwoDimensional<ElementType> {
     return std::nullopt;
   }
 
+  // Compares two instances for approximate equality considering only element
+  // bytes. Returns the location of the first mismatch, if any.
+  std::optional<std::tuple<size_t, size_t>> compare_approx_to(
+      ElementType threshold_percent, const Array2D<ElementType> &other) const {
+    for (size_t row = 0; row < height(); ++row) {
+      for (size_t column = 0; column < width(); ++column) {
+        const ElementType *lhs = at(row, column);
+        const ElementType *rhs = other.at(row, column);
+        if (lhs != 0) {
+          ElementType diff = abs_diff<ElementType>(*lhs, *rhs);
+          ElementType diff_percentage = (diff / std::abs(*lhs)) * 100;
+          if (!lhs || !rhs || diff_percentage > threshold_percent) {
+            return std::make_tuple(row, column);
+          }
+          // Special handling to avoid division by 0.
+        } else {
+          // Seems like clang-tidy does not understand what the fill member
+          // function of test::Array2D does, so these exceptions are required
+          // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign)
+          float expected = *(lhs);
+          // NOLINTEND(clang-analyzer-core.uninitialized.Assign)
+
+          float actual = *(rhs);
+          // Error of 1 ULP means that actual is either same as expected, or
+          // the next float value in negative of positive direction
+          if (!lhs || !rhs || (std::nextafterf(actual, expected) != expected)) {
+            return std::make_tuple(row, column);
+          }
+        }
+      }
+    }
+
+    return std::nullopt;
+  }
+
   // Returns a pointer to the first element.
   ElementType *data() { return data_; }
 
@@ -385,6 +421,25 @@ class Array2D : public TwoDimensional<ElementType> {
     }                                                               \
   } while (0 != 0)
 
+// Compares two Array2D objects for approximate equality.
+#define EXPECT_APPROX_EQ_ARRAY2D(threshold_percent, lhs, rhs)            \
+  do {                                                                   \
+    ASSERT_EQ((lhs).width(), (rhs).width())                              \
+        << "Mismatch in width." << std::endl;                            \
+    ASSERT_EQ((lhs).height(), (rhs).height())                            \
+        << "Mismatch in height." << std::endl;                           \
+    ASSERT_EQ((lhs).channels(), (rhs).channels())                        \
+        << "Mismatch in channels." << std::endl;                         \
+    auto mismatch = (lhs).compare_approx_to((threshold_percent), (rhs)); \
+    if (mismatch) {                                                      \
+      auto [row, col] = *mismatch;                                       \
+      GTEST_FAIL() << "Mismatch at (row=" << row << ", col=" << col      \
+                   << "): " << std::hex << std::showbase                 \
+                   << +(lhs).at(row, col)[0] << " vs "                   \
+                   << +(rhs).at(row, col)[0] << "." << std::endl;        \
+    }                                                                    \
+  } while (0 != 0)
+
 // Compares two Array2D objects for inequality.
 #define EXPECT_NE_ARRAY2D(lhs, rhs)                                \
   do {                                                             \
diff --git a/test/framework/border.cpp b/test/framework/border.cpp
index 5a3c6688f..5761c09ec 100644
--- a/test/framework/border.cpp
+++ b/test/framework/border.cpp
@@ -318,4 +318,8 @@ template void prepare_borders<uint16_t>(kleidicv_border_type_t,
                                         const Bordered *,
                                         TwoDimensional<uint16_t> *);
 
+template void prepare_borders<float>(kleidicv_border_type_t,
+                                     kleidicv_border_values_t, const Bordered *,
+                                     TwoDimensional<float> *);
+
 }  // namespace test
diff --git a/test/framework/generator.h b/test/framework/generator.h
index 3870737e3..54a812fe3 100644
--- a/test/framework/generator.h
+++ b/test/framework/generator.h
@@ -61,6 +61,24 @@ class PseudoRandomNumberGeneratorIntRange
   std::uniform_int_distribution<ElementType> dist_;
 };  // end of class PseudoRandomNumberGeneratorIntRange<ElementType>
 
+// Generates pseudo-random floats within the range [min, max].
+template <typename ElementType,
+          std::enable_if_t<std::is_same_v<ElementType, float>, bool> = true>
+class PseudoRandomNumberGeneratorFloatRange
+    : public PseudoRandomNumberGenerator<ElementType> {
+ public:
+  PseudoRandomNumberGeneratorFloatRange(ElementType min, ElementType max)
+      : PseudoRandomNumberGenerator<ElementType>(), dist_(min, max) {}
+
+  // Yields the next value or std::nullopt.
+  std::optional<ElementType> next() override {
+    return static_cast<ElementType>(dist_(this->rng_));
+  }
+
+ protected:
+  std::uniform_real_distribution<ElementType> dist_;
+};  // end of class PseudoRandomNumberGeneratorFloatRange<ElementType>
+
 // Generator which yields values of an iterable container.
 template <typename IterableType>
 class SequenceGenerator : public Generator<typename IterableType::value_type> {
diff --git a/test/framework/kernel.h b/test/framework/kernel.h
index 94ce83439..3c3314950 100644
--- a/test/framework/kernel.h
+++ b/test/framework/kernel.h
@@ -151,8 +151,13 @@ class KernelTest {
     prepare_source(element_generator);
     prepare_expected(kernel, array_layout, border_type, border_values);
     prepare_actual();
-    check_results(
-        this->call_api(&input_, &actual_, border_type, border_values));
+    if constexpr (std::is_same_v<InputType, float>) {
+      check_approx_results(
+          this->call_api(&input_, &actual_, border_type, border_values));
+    } else {
+      check_results(
+          this->call_api(&input_, &actual_, border_type, border_values));
+    }
   }
 
  protected:
@@ -187,9 +192,14 @@ class KernelTest {
         IntermediateType coefficient = kernel.at(height, width)[0];
         InputType value =
             source.at(row + height, column + width * source.channels())[0];
-        result = saturating_add(
-            result,
-            saturating_mul(coefficient, static_cast<IntermediateType>(value)));
+        IntermediateType product{0};
+        if constexpr (std::is_integral_v<InputType>) {
+          product =
+              saturating_mul(coefficient, static_cast<IntermediateType>(value));
+          result = saturating_add(result, product);
+        } else {
+          result += (coefficient * value);
+        }
       }
     }
 
@@ -277,6 +287,18 @@ class KernelTest {
     EXPECT_EQ_ARRAY2D(expected_, actual_);
   }
 
+  void check_approx_results(kleidicv_error_t err) {
+    if (debug_) {
+      std::cout << "[actual]" << std::endl;
+      dump(&actual_);
+    }
+
+    EXPECT_EQ(KLEIDICV_OK, err);
+
+    // Check that the actual result matches the expectation with less precision.
+    EXPECT_APPROX_EQ_ARRAY2D(1e-4, expected_, actual_);
+  }
+
   // Input operand for the operation.
   Array2D<InputType> input_;
   // Input operand with borders, used to calculate expected values.
diff --git a/test/framework/utils.cpp b/test/framework/utils.cpp
index 77d00e149..eaedb2f6f 100644
--- a/test/framework/utils.cpp
+++ b/test/framework/utils.cpp
@@ -20,6 +20,13 @@ bool test::Options::are_long_running_tests_skipped_ = true;
 
 namespace test {
 
+float floatval(uint32_t v) {
+  float result;  // Avoid cppcoreguidelines-init-variables. NOLINT
+  static_assert(sizeof(result) == sizeof(v));
+  memcpy(&result, &v, sizeof(result));
+  return result;
+}
+
 template <typename ElementType>
 void dump(const TwoDimensional<ElementType> *elements) {
   if (!elements) {
diff --git a/test/framework/utils.h b/test/framework/utils.h
index 622dc527a..d645a3937 100644
--- a/test/framework/utils.h
+++ b/test/framework/utils.h
@@ -109,6 +109,8 @@ class Options {
   static bool are_long_running_tests_skipped_;
 };  // end of class Options
 
+float floatval(uint32_t v);
+
 // Prints all the elements in a two-dimensional space.
 template <typename ElementType>
 void dump(const TwoDimensional<ElementType> *elements);
@@ -206,6 +208,11 @@ T saturating_mul(T a, T b) {
   return result;
 }
 
+template <typename T>
+auto abs_diff(T a, T b) {
+  return a > b ? a - b : b - a;
+}
+
 }  // namespace test
 
 #endif  // KLEIDICV_TEST_FRAMEWORK_UTILS_H_
-- 
GitLab