diff --git a/CHANGELOG.md b/CHANGELOG.md
index 096a534e628ac7f4b4cf5916d4ac6016de526359..07f0f59a7db5fa4244aab38e747434395f1bfa60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ This changelog aims to follow the guiding principles of
 - Exponential function for float.
 - Bitwise and.
 - Gaussian Blur for 7x7 kernels.
+- Scale function for float.
 
 ### Fixed
 
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 627fb545d1af976c558545610688ab18559641bb..1ae98c562cb7aff84cc590b4cab97cd364f9fbd0 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -699,11 +699,19 @@ int convertTo(const uchar *src_data, size_t src_step, int src_depth,
       return CV_HAL_ERROR_NOT_IMPLEMENTED;
     }
 
-    if (src_depth == CV_8U) {
-      return convert_error(kleidicv_scale_u8(
-          reinterpret_cast<const uint8_t *>(src_data), src_step,
-          reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height,
-          static_cast<float>(scale), static_cast<float>(shift)));
+    switch (src_depth) {
+      case CV_8U:
+        return convert_error(kleidicv_scale_u8(
+            reinterpret_cast<const uint8_t *>(src_data), src_step,
+            reinterpret_cast<uint8_t *>(dst_data), dst_step, width, height,
+            static_cast<float>(scale), static_cast<float>(shift)));
+      case CV_32F:
+        return convert_error(kleidicv_scale_f32(
+            reinterpret_cast<const float *>(src_data), src_step,
+            reinterpret_cast<float *>(dst_data), dst_step, width, height,
+            static_cast<float>(scale), static_cast<float>(shift)));
+      default:
+        break;
     }
   }
 
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 780400ec9e64080a03fcf3cad5c1a2a1928c2041..4ea11cca07bb96cfd28de67d40a0d62a1571467a 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -125,6 +125,37 @@ static void min_max_loc_u8(benchmark::State& state) {
 }
 BENCHMARK(min_max_loc_u8);
 
+template <typename T, typename Function>
+static void scale(Function f, float factor, float shift,
+                  benchmark::State& state) {
+  // Setup
+  std::vector<T> src, dst;
+  src.resize(image_width * image_height);
+  dst.resize(image_width * image_height);
+
+  std::mt19937 generator;
+  std::generate(src.begin(), src.end(), generator);
+
+  for (auto _ : state) {
+    // This code gets benchmarked
+    auto unused =
+        f(src.data(), image_width * sizeof(T), dst.data(),
+          image_width * sizeof(T), image_width, image_height, factor, shift);
+    (void)unused;
+  }
+}
+
+#define BENCH_SCALE(benchname, name, factor, shift, type) \
+  static void benchname(benchmark::State& state) {        \
+    scale<type>(kleidicv_##name, factor, shift, state);   \
+  }                                                       \
+  BENCHMARK(benchname)
+
+BENCH_SCALE(scale_u8_1, scale_u8, 1.0, 4.567, uint8_t);
+BENCH_SCALE(scale_u8_generic, scale_u8, 1.234, 4.567, uint8_t);
+BENCH_SCALE(scale_f32_1, scale_f32, 1.0, 4.567, float);
+BENCH_SCALE(scale_f32_generic, scale_f32, 1.234, 4.567, float);
+
 template <typename T, typename F>
 static void resize_linear(F f, size_t scale_x, size_t scale_y,
                           benchmark::State& state) {
diff --git a/conformity/opencv/CMakeLists.txt b/conformity/opencv/CMakeLists.txt
index 7bb122fd65e206a05b9cea6b5662a44370490fc1..360a57639ffe1eb991934d8f618352ec2330f83a 100644
--- a/conformity/opencv/CMakeLists.txt
+++ b/conformity/opencv/CMakeLists.txt
@@ -37,6 +37,7 @@ add_executable(
   test_sobel.cpp
   test_exp.cpp
   test_float_conv.cpp
+  test_scale.cpp
 )
 
 target_link_libraries(
@@ -78,7 +79,7 @@ add_executable(
   test_sobel.cpp
   test_exp.cpp
   test_float_conv.cpp
-
+  test_scale.cpp
 )
 
 target_link_libraries(
diff --git a/conformity/opencv/test_scale.cpp b/conformity/opencv/test_scale.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f669f1fa35b1e36a09c537aec8ae72b68f3650e1
--- /dev/null
+++ b/conformity/opencv/test_scale.cpp
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+#include "test_scale.h"
+
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "opencv2/core/hal/interface.h"
+
+template <int Scale, int Shift>
+cv::Mat exec_scale(cv::Mat& input_mat) {
+  cv::Mat result;
+  input_mat.convertTo(result, -1, Scale / 1000.0, Shift / 1000.0);
+  return result;
+}
+
+#if MANAGER
+template <int Scale, int Shift, size_t Format>
+bool test_scale(int index, RecreatedMessageQueue& request_queue,
+                RecreatedMessageQueue& reply_queue) {
+  cv::RNG rng(0);
+
+  for (size_t x = 5; x <= 16; ++x) {
+    for (size_t y = 5; y <= 16; ++y) {
+      cv::Mat input_mat(x, y, Format);
+      rng.fill(input_mat, cv::RNG::NORMAL, 0.0, 1.0e10);
+      cv::Mat actual_mat = exec_scale<Scale, Shift>(input_mat);
+      cv::Mat expected_mat = get_expected_from_subordinate(
+          index, request_queue, reply_queue, input_mat);
+
+      bool success =
+          (CV_MAT_DEPTH(Format) == CV_32F &&
+           !are_float_matrices_different<float>(1e-5, actual_mat,
+                                                expected_mat)) ||
+          (CV_MAT_DEPTH(Format) == CV_8U &&
+           !are_matrices_different<uint8_t>(0, actual_mat, expected_mat));
+      if (!success) {
+        fail_print_matrices(x, y, input_mat, actual_mat, expected_mat);
+      }
+    }
+  }
+
+  return false;
+}
+#endif
+
+std::vector<test>& scale_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("Scale float32, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_32FC4>), (exec_scale<1000, 2000>)),
+    TEST("Scale float32, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_32FC4>), (exec_scale<(-10000), 0>)),
+    TEST("Scale float32, scale=3.14, shift=2.72", (test_scale<3140, 2720, CV_32FC4>), (exec_scale<3140, 2720>)),
+    TEST("Scale float32, scale=7e5, shift=8e-3", (test_scale<700000000, 8, CV_32FC4>), (exec_scale<700000000, 8>)),
+    TEST("Scale float32, scale=1e-3, shift=8e-3", (test_scale<1, 8, CV_32FC4>), (exec_scale<1, 8>)),
+
+    TEST("Scale uint8, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_8UC4>), (exec_scale<1000, 2000>)),
+    TEST("Scale uint8, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_8UC4>), (exec_scale<(-10000), 0>)),
+    TEST("Scale uint8, scale=3.14, shift=-2.72", (test_scale<3140, -2720, CV_8UC4>), (exec_scale<3140, -2720>)),
+    TEST("Scale uint8, scale=17.17, shift=3.9", (test_scale<17170, 3900, CV_8UC4>), (exec_scale<17170, 3900>)),
+    TEST("Scale uint8, scale=0.13, shift=230", (test_scale<130, 230000, CV_8UC4>), (exec_scale<130, 230000>)),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/test_scale.h b/conformity/opencv/test_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..86916230aac41a51bb6c6bec78e26ca71b070401
--- /dev/null
+++ b/conformity/opencv/test_scale.h
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_CONFORMITY_TEST_SCALE_H_
+#define KLEIDICV_OPENCV_CONFORMITY_TEST_SCALE_H_
+
+#include <vector>
+
+#include "tests.h"
+
+std::vector<test>& scale_tests_get();
+
+#endif  // KLEIDICV_OPENCV_CONFORMITY_TEST_SCALE_H_
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index 8064a6565ec987aa949cc235ed83f686fceede89..b7f3e38aaef207c3f40199bb127dae6e7d0fef66 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -16,6 +16,7 @@
 #include "test_gaussian_blur.h"
 #include "test_min_max.h"
 #include "test_rgb2yuv.h"
+#include "test_scale.h"
 #include "test_sobel.h"
 
 static std::vector<test> merge_tests(
@@ -28,15 +29,10 @@ static std::vector<test> merge_tests(
   return all_tests;
 }
 
-std::vector<test> all_tests = merge_tests({
-    binary_op_tests_get,
-    gaussian_blur_tests_get,
-    min_max_tests_get,
-    rgb2yuv_tests_get,
-    sobel_tests_get,
-    exp_tests_get,
-    float_conversion_tests_get,
-});
+std::vector<test> all_tests =
+    merge_tests({binary_op_tests_get, gaussian_blur_tests_get,
+                 min_max_tests_get, rgb2yuv_tests_get, sobel_tests_get,
+                 exp_tests_get, float_conversion_tests_get, scale_tests_get});
 
 #if MANAGER
 void fail_print_matrices(size_t height, size_t width, cv::Mat& input,
diff --git a/doc/functionality.md b/doc/functionality.md
index d09a7a721aeb2067c11b4b35e1b151af97584c46..da7e88899710a16e5f070d2e142fdff1ac78eac9 100644
--- a/doc/functionality.md
+++ b/doc/functionality.md
@@ -18,7 +18,7 @@ See `doc/opencv.md` for details of the functionality available in OpenCV.
 | Saturating Multiply          |  x  |  x  |  x  |  x  |  x  |     |     |     |     |     |
 | Threshold binary             |     |  x  |     |     |     |     |     |     |     |     |
 | SaturatingAddAbsWithThreshold|     |     |  x  |     |     |     |     |     |     |     |
-| Scale                        |     |  x  |     |     |     |     |     |     |     |     |
+| Scale                        |     |  x  |     |     |     |     |     |     |  x  |     |
 | CompareEqual                 |     |  x  |     |     |     |     |     |     |     |     |
 | CompareGreater               |     |  x  |     |     |     |     |     |     |     |     |
 
diff --git a/doc/opencv.md b/doc/opencv.md
index 57f7d7e32a228d4d9df5a178b1564d5b6cde50c6..a2047eea07f435ce7dbb3a008ba776aae1f84656 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -167,7 +167,9 @@ Notes on parameters:
   + `CV_32S`
 
 ### `convertTo`
-This function will scale given input using `scale` and `shift` if they are significant enough, and if `src_depth` and `dst_depth` are equal to `CV_8U`.
+This function will scale given input using `scale` and `shift` if they are significant enough, and if `src_depth` equals `dst_depth`. Supported depths:
+  + `CV_8U`
+  + `CV_32F`
 
 Additionally, it is able to convert between data types as follows:
 
@@ -190,4 +192,4 @@ Notes on parameters:
 * `operation` - flag specifying correspondence between the arrays.
 Supported [OpenCV cmp types](https://docs.opencv.org/5.x/d2/de8/group__core__array.html#ga0cc47ff833d40b58ecbe1d609a53d784) are:
   + `cv::CMP_EQ `
-  + `cv::CMP_GT`
\ No newline at end of file
+  + `cv::CMP_GT`
diff --git a/kleidicv/include/kleidicv/kleidicv.h b/kleidicv/include/kleidicv/kleidicv.h
index b27f585826f59b5ce5d1aa7cb6cf0e7ccd0c4dcf..134ca5ed81987fd7551b6cfc8593d9227eb6a35f 100644
--- a/kleidicv/include/kleidicv/kleidicv.h
+++ b/kleidicv/include/kleidicv/kleidicv.h
@@ -1460,6 +1460,10 @@ KLEIDICV_API_DECLARATION(kleidicv_min_max_loc_u8, const uint8_t *src,
 KLEIDICV_API_DECLARATION(kleidicv_scale_u8, const uint8_t *src,
                          size_t src_stride, uint8_t *dst, size_t dst_stride,
                          size_t width, size_t height, float scale, float shift);
+/// @copydoc kleidicv_scale_u8
+KLEIDICV_API_DECLARATION(kleidicv_scale_f32, const float *src,
+                         size_t src_stride, float *dst, size_t dst_stride,
+                         size_t width, size_t height, float scale, float shift);
 
 /// Exponential function, input is the elements in `src`, output is the elements
 /// in `dst`.
diff --git a/kleidicv/src/arithmetics/scale_api.cpp b/kleidicv/src/arithmetics/scale_api.cpp
index d7d22964b8f3e32097479e79f01b1816b4d8b0ad..ef6ed8584dab2c57561d0396223fcba80cef5889 100644
--- a/kleidicv/src/arithmetics/scale_api.cpp
+++ b/kleidicv/src/arithmetics/scale_api.cpp
@@ -16,18 +16,27 @@ kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
                        float scale, float shift);
 }  // namespace neon
 
-namespace sve2 {}  // namespace sve2
+namespace sve2 {
 
-namespace sme2 {}  // namespace sme2
+template <typename T>
+kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift);
 
-}  // namespace kleidicv
+}  // namespace sve2
+
+namespace sme2 {
+template <typename T>
+kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift);
+
+}  // namespace sme2
 
-#define KLEIDICV_DEFINE_SCALE_API(name, type)                              \
-  KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::scale<type>, nullptr, \
-                              nullptr)
+}  // namespace kleidicv
 
-KLEIDICV_DEFINE_SCALE_API(kleidicv_scale_u8, uint8_t);
-// KLEIDICV_DEFINE_SCALE_API(kleidicv_scale_s8, int8_t);
-// KLEIDICV_DEFINE_SCALE_API(kleidicv_scale_u16, uint16_t);
-// KLEIDICV_DEFINE_SCALE_API(kleidicv_scale_s16, int16_t);
-// KLEIDICV_DEFINE_SCALE_API(kleidicv_scale_s32, int32_t);
+KLEIDICV_MULTIVERSION_C_API(kleidicv_scale_u8, &kleidicv::neon::scale<uint8_t>,
+                            nullptr, nullptr);
+KLEIDICV_MULTIVERSION_C_API(kleidicv_scale_f32, &kleidicv::neon::scale<float>,
+                            &kleidicv::sve2::scale<float>,
+                            &kleidicv::sme2::scale<float>);
diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp
index 9c29957314a860cad99ff4a03a148db56fce133b..7c46ec0454964b852f9486aa83e064d7edbab87f 100644
--- a/kleidicv/src/arithmetics/scale_neon.cpp
+++ b/kleidicv/src/arithmetics/scale_neon.cpp
@@ -8,12 +8,15 @@
 
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/neon.h"
+#include "kleidicv/traits.h"
 
 namespace kleidicv::neon {
 
 // Scale algorithm: for each value in the source,
 //   dst[i] = src[i] * scale + shift   (floating point operation)
 //
+// Unsigned 8-bit implementation
+//
 // Since converting from uint8 to float32 and back takes more steps,
 // 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs
 // and TBXs to map the values directly from uint8 to uint8:
@@ -51,9 +54,9 @@ namespace kleidicv::neon {
 // size and speed.
 
 template <typename ScalarType>
-class ScaleBase : public UnrollTwice {
+class ScaleIntBase : public UnrollTwice {
  public:
-  ScaleBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
+  ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
 
  protected:
   static constexpr ScalarType ScalarMax =
@@ -70,15 +73,16 @@ class ScaleBase : public UnrollTwice {
   float scale_, shift_;
 };
 
-template <typename ScalarType>
-class ScaleTbx final : public ScaleBase<ScalarType> {
+class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
  public:
+  using ScalarType = uint8_t;
   using VecTraits = neon::VecTraits<ScalarType>;
   using VectorType = typename VecTraits::VectorType;
   using Vector2Type = typename VecTraits::Vector2Type;
   using Vector3Type = typename VecTraits::Vector3Type;
 
-  ScaleTbx(float scale, float shift) : ScaleBase<ScalarType>(scale, shift) {
+  ScaleUint8Tbx(float scale, float shift)
+      : ScaleIntBase<uint8_t>(scale, shift) {
     constexpr size_t TableLength = 1 << (CHAR_BIT * sizeof(ScalarType));
     ScalarType values[TableLength];
     for (size_t i = 0; i < TableLength; ++i) {
@@ -112,21 +116,21 @@ class ScaleTbx final : public ScaleBase<ScalarType> {
   ScalarType scalar_path(ScalarType src) { return this->scale_value(src); }
 
  private:
-  Vector3Type t0_3_, t1_3_, t3_3_, t5_3_;
-  Vector2Type t2_2_, t4_2_;
+  Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
+  Vector2Type t2_2_{}, t4_2_{};
   VectorType v_step3_, v_step2_;
-};  // end of class ScaleTbx<T>
+};  // end of class ScaleUint8Tbx<T>
 
-// Opposite to ScaleTbx, ScaleFloat is the direct approach:
+// Opposite to ScaleUint8Tbx, ScaleUint8Calc is the direct approach:
 // - calculate dst[i] = src[i] * scale + shift  using vector instructions
-template <typename ScalarType>
-class ScaleFloat final : public ScaleBase<ScalarType> {
+class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
  public:
+  using ScalarType = uint8_t;
   using VecTraits = neon::VecTraits<ScalarType>;
   using VectorType = typename VecTraits::VectorType;
 
-  ScaleFloat(float scale, float shift)
-      : ScaleBase<ScalarType>(scale, shift),
+  ScaleUint8Calc(float scale, float shift)
+      : ScaleIntBase<ScalarType>(scale, shift),
         vscale_{vdupq_n_f32(scale)},
         vshift_{vdupq_n_f32(shift)} {}
 
@@ -166,10 +170,68 @@ class ScaleFloat final : public ScaleBase<ScalarType> {
   }
 
   float32x4_t vscale_, vshift_;
-};  // end of class ScaleFloat<T>
+};  // end of class ScaleUint8Calc<T>
+
+// -----------------------------------------------------------------------
+// Float implementation
+// -----------------------------------------------------------------------
+
+class AddFloat final : public UnrollTwice,
+                       public UnrollOnce,
+                       public TryToAvoidTailLoop {
+ public:
+  using ScalarType = float;
+  using VecTraits = neon::VecTraits<ScalarType>;
+  using VectorType = typename VecTraits::VectorType;
+
+  explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {}
+
+  VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); }
+
+  // NOLINTBEGIN(readability-make-member-function-const)
+  ScalarType scalar_path(ScalarType src) { return src + shift_; }
+  // NOLINTEND(readability-make-member-function-const)
+
+ private:
+  float shift_;
+  float32x4_t vshift_;
+};  // end of class AddFloat
+
+class ScaleFloat final : public UnrollTwice,
+                         public UnrollOnce,
+                         public TryToAvoidTailLoop {
+ public:
+  using ScalarType = float;
+  using VecTraits = neon::VecTraits<ScalarType>;
+  using VectorType = typename VecTraits::VectorType;
+
+  ScaleFloat(float scale, float shift)
+      : scale_{scale},
+        shift_{shift},
+        vscale_{vdupq_n_f32(scale)},
+        vshift_{vdupq_n_f32(shift)} {}
+
+  VectorType vector_path(VectorType src) {
+    return vmlaq_f32(vshift_, src, vscale_);
+  }
+
+  // NOLINTBEGIN(readability-make-member-function-const)
+  ScalarType scalar_path(ScalarType src) { return src * scale_ + shift_; }
+  // NOLINTEND(readability-make-member-function-const)
+
+ private:
+  float scale_, shift_;
+  float32x4_t vscale_, vshift_;
+};  // end of class ScaleFloat
 
 template <typename T>
 kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift);
+
+// Specialization for uint8_t
+template <>
+kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
                        size_t dst_stride, size_t width, size_t height,
                        float scale, float shift) {
   CHECK_POINTER_AND_STRIDE(src, src_stride, height);
@@ -177,30 +239,41 @@ kleidicv_error_t scale(const T *src, size_t src_stride, T *dst,
   CHECK_IMAGE_SIZE(width, height);
 
   Rectangle rect{width, height};
-  Rows<const T> src_rows{src, src_stride};
-  Rows<T> dst_rows{dst, dst_stride};
+  Rows<const uint8_t> src_rows{src, src_stride};
+  Rows<uint8_t> dst_rows{dst, dst_stride};
   // For smaller inputs, the full calculation is the faster
   if (width * height < 2500) {  // empirical value
-    ScaleFloat<T> operation(scale, shift);
+    ScaleUint8Calc operation(scale, shift);
     apply_operation_by_rows(operation, rect, src_rows, dst_rows);
   } else {
     // For bigger inputs, it's faster to pre-calculate the table
     // and map those values during the run
-    ScaleTbx<T> operation(scale, shift);
+    ScaleUint8Tbx operation(scale, shift);
     apply_operation_by_rows(operation, rect, src_rows, dst_rows);
   }
   return KLEIDICV_OK;
 }
 
-#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
-  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t scale<type>(       \
-      const type *src, size_t src_stride, type *dst, size_t dst_stride, \
-      size_t width, size_t height, float scale, float shift)
+// Specialization for float
+template <>
+kleidicv_error_t scale(const float *src, size_t src_stride, float *dst,
+                       size_t dst_stride, size_t width, size_t height,
+                       float scale, float shift) {
+  CHECK_POINTER_AND_STRIDE(src, src_stride, height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
+  CHECK_IMAGE_SIZE(width, height);
 
-KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t);
-// KLEIDICV_INSTANTIATE_TEMPLATE(int8_t);
-// KLEIDICV_INSTANTIATE_TEMPLATE(int16_t);
-// KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t);
-// KLEIDICV_INSTANTIATE_TEMPLATE(int32_t);
+  Rectangle rect{width, height};
+  Rows<const float> src_rows{src, src_stride};
+  Rows<float> dst_rows{dst, dst_stride};
+  if (scale == 1.0) {
+    AddFloat operation(shift);
+    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  } else {
+    ScaleFloat operation(scale, shift);
+    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  }
+  return KLEIDICV_OK;
+}
 
 }  //  namespace kleidicv::neon
diff --git a/kleidicv/src/arithmetics/scale_sc.h b/kleidicv/src/arithmetics/scale_sc.h
new file mode 100644
index 0000000000000000000000000000000000000000..269c15b4c6a18f35628a42fa753845f156fa16ef
--- /dev/null
+++ b/kleidicv/src/arithmetics/scale_sc.h
@@ -0,0 +1,88 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_SCALE_SC_H
+#define KLEIDICV_SCALE_SC_H
+
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/sve2.h"
+
+namespace KLEIDICV_TARGET_NAMESPACE {
+
+class AddFloat final : public UnrollTwice {
+ public:
+  using ContextType = Context;
+  using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>;
+  using VectorType = typename VecTraits::VectorType;
+
+  explicit AddFloat(const svfloat32_t &svshift) KLEIDICV_STREAMING_COMPATIBLE
+      : svshift_{svshift} {}
+
+  // NOLINTBEGIN(readability-make-member-function-const)
+  VectorType vector_path(ContextType ctx,
+                         VectorType src) KLEIDICV_STREAMING_COMPATIBLE {
+    return svadd_x(ctx.predicate(), src, svshift_);
+  }
+  // NOLINTEND(readability-make-member-function-const)
+
+ private:
+  const svfloat32_t &svshift_;
+};  // end of class AddFloat
+
+class ScaleFloat final : public UnrollTwice {
+ public:
+  using ContextType = Context;
+  using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>;
+  using VectorType = typename VecTraits::VectorType;
+
+  ScaleFloat(const svfloat32_t &svscale,
+             const svfloat32_t &svshift) KLEIDICV_STREAMING_COMPATIBLE
+      : svscale_{svscale},
+        svshift_{svshift} {}
+
+  // NOLINTBEGIN(readability-make-member-function-const)
+  VectorType vector_path(ContextType ctx,
+                         VectorType src) KLEIDICV_STREAMING_COMPATIBLE {
+    return svmla_x(ctx.predicate(), svshift_, src, svscale_);
+  }
+  // NOLINTEND(readability-make-member-function-const)
+
+ private:
+  const svfloat32_t &svscale_, &svshift_;
+};  // end of class ScaleFloat
+
+template <typename T>
+kleidicv_error_t scale_sc(const T *src, size_t src_stride, T *dst,
+                          size_t dst_stride, size_t width, size_t height,
+                          float scale,
+                          float shift) KLEIDICV_STREAMING_COMPATIBLE;
+
+// Specialization for float
+template <>
+kleidicv_error_t scale_sc(const float *src, size_t src_stride, float *dst,
+                          size_t dst_stride, size_t width, size_t height,
+                          float scale,
+                          float shift) KLEIDICV_STREAMING_COMPATIBLE {
+  CHECK_POINTER_AND_STRIDE(src, src_stride, height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
+  CHECK_IMAGE_SIZE(width, height);
+
+  Rectangle rect{width, height};
+  Rows<const float> src_rows{src, src_stride};
+  Rows<float> dst_rows{dst, dst_stride};
+  svfloat32_t svscale = svdup_f32(scale);
+  svfloat32_t svshift = svdup_f32(shift);
+  if (scale == 1.0) {
+    AddFloat operation(svshift);
+    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  } else {
+    ScaleFloat operation(svscale, svshift);
+    apply_operation_by_rows(operation, rect, src_rows, dst_rows);
+  }
+  return KLEIDICV_OK;
+}
+
+}  // namespace KLEIDICV_TARGET_NAMESPACE
+
+#endif  // KLEIDICV_SCALE_SC_H
diff --git a/kleidicv/src/arithmetics/scale_sme2.cpp b/kleidicv/src/arithmetics/scale_sme2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c1b6822884f94368b9e3bdc049850cf6356a72a
--- /dev/null
+++ b/kleidicv/src/arithmetics/scale_sme2.cpp
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scale_sc.h"
+
+namespace kleidicv::sme2 {
+
+template <typename T>
+KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t
+scale(const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width,
+      size_t height, float scale, float shift) {
+  return scale_sc<T>(src, src_stride, dst, dst_stride, width, height, scale,
+                     shift);
+}
+
+#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t scale<type>(       \
+      const type* src, size_t src_stride, type* dst, size_t dst_stride, \
+      size_t width, size_t height, float scale, float shift)
+
+KLEIDICV_INSTANTIATE_TEMPLATE(float);
+
+}  // namespace kleidicv::sme2
diff --git a/kleidicv/src/arithmetics/scale_sve2.cpp b/kleidicv/src/arithmetics/scale_sve2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ba7b1c32e2162fdc1078c1ebd76673419b666b5d
--- /dev/null
+++ b/kleidicv/src/arithmetics/scale_sve2.cpp
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scale_sc.h"
+
+namespace kleidicv::sve2 {
+
+template <typename T>
+KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t scale(const T* src, size_t src_stride,
+                                                T* dst, size_t dst_stride,
+                                                size_t width, size_t height,
+                                                float scale, float shift) {
+  return scale_sc<T>(src, src_stride, dst, dst_stride, width, height, scale,
+                     shift);
+}
+
+#define KLEIDICV_INSTANTIATE_TEMPLATE(type)                             \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t scale<type>(       \
+      const type* src, size_t src_stride, type* dst, size_t dst_stride, \
+      size_t width, size_t height, float scale, float shift)
+
+KLEIDICV_INSTANTIATE_TEMPLATE(float);
+
+}  // namespace kleidicv::sve2
diff --git a/scripts/benchmark/run_benchmarks_4K.sh b/scripts/benchmark/run_benchmarks_4K.sh
index d51b6a72ff82a8ab13d36f6dfe766879cdae9f0e..dc20bddcaa125a413237be32f74fa4f75fc5a359 100755
--- a/scripts/benchmark/run_benchmarks_4K.sh
+++ b/scripts/benchmark/run_benchmarks_4K.sh
@@ -52,5 +52,7 @@ RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Resize4x4_8b op
 RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Resize4x4_float opencv_perf_imgproc '*resizeUpLinearNonExact*' '(32FC1, (960x540, 3840x2160))')")
 
 RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale opencv_perf_core '*convertTo*' '(3840x2160, 8UC1, 8UC1, 1, 1.234, 4.567)')")
+RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale_float_1.0 opencv_perf_core '*convertTo*' '(1920x1080, 32FC1, 32FC1, 1, 1, 4.567)')")
+RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale_float opencv_perf_core '*convertTo*' '(1920x1080, 32FC1, 32FC1, 1, 1.234, 4.567)')")
 
 echo "$RES"
diff --git a/scripts/benchmark/run_benchmarks_FHD.sh b/scripts/benchmark/run_benchmarks_FHD.sh
index 2f21a29bd18e4ea46f4b8f59d2676ee4edddecaa..53f426b960c8b7865f633f52d5e6979e69aa6dc5 100755
--- a/scripts/benchmark/run_benchmarks_FHD.sh
+++ b/scripts/benchmark/run_benchmarks_FHD.sh
@@ -52,5 +52,7 @@ RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Resize4x4_8b op
 RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Resize4x4_float opencv_perf_imgproc '*resizeUpLinearNonExact*' '(32FC1, (480x270, 1920x1080))')")
 
 RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale opencv_perf_core '*convertTo*' '(1920x1080, 8UC1, 8UC1, 1, 1.234, 4.567)')")
+RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale_float_1.0 opencv_perf_core '*convertTo*' '(1920x1080, 32FC1, 32FC1, 1, 1, 4.567)')")
+RES=$(printf "${RES}\n$(${DEV_DIR}/perf_test_op.sh $CPU $THERMAL Scale_float opencv_perf_core '*convertTo*' '(1920x1080, 32FC1, 32FC1, 1, 1.234, 4.567)')")
 
 echo "$RES"
diff --git a/test/api/test_scale.cpp b/test/api/test_scale.cpp
index 3c7232e3a4733fd18877d2acd1812b688ee5ac5e..39663a2571881ab681e4fe5255fc8ecfddfd53e3 100644
--- a/test/api/test_scale.cpp
+++ b/test/api/test_scale.cpp
@@ -4,53 +4,90 @@
 
 #include <gtest/gtest.h>
 
+#include <algorithm>
+#include <limits>
+
 #include "framework/array.h"
 #include "framework/generator.h"
 #include "framework/operation.h"
 #include "kleidicv/kleidicv.h"
 #include "test_config.h"
 
-#define KLEIDICV_SCALE(type, suffix) \
-  KLEIDICV_API(scale, kleidicv_scale_##suffix, type)
+template <typename DestinationType, typename SourceType>
+static DestinationType saturating_cast(SourceType value) {
+  if (value >
+      static_cast<SourceType>(std::numeric_limits<DestinationType>::max())) {
+    return std::numeric_limits<DestinationType>::max();
+  }
+  if (value < std::numeric_limits<DestinationType>::lowest()) {
+    return std::numeric_limits<DestinationType>::lowest();
+  }
+  return static_cast<DestinationType>(value);
+}
 
-KLEIDICV_SCALE(uint8_t, u8);
+uint8_t scalar_scale_u8(uint8_t x, float scale, float shift) {
+  float result = static_cast<float>(x) * scale + shift;
+  if (result < std::numeric_limits<uint8_t>::min()) {
+    return std::numeric_limits<uint8_t>::min();
+  }
+  if (result > std::numeric_limits<uint8_t>::max()) {
+    return std::numeric_limits<uint8_t>::max();
+  }
+  return static_cast<uint8_t>(lrintf(result));
+}
+
+float scalar_scale_f32(float x, float scale, float shift) {
+  return x * scale + shift;
+}
+
+#define KLEIDICV_SCALE_API(type, suffix) \
+  KLEIDICV_API(scale_api, kleidicv_scale_##suffix, type)
+
+#define KLEIDICV_SCALE_OPERATION(type, suffix) \
+  KLEIDICV_API(scale_operation, &scalar_scale_##suffix, type)
+
+KLEIDICV_SCALE_API(uint8_t, u8);
+KLEIDICV_SCALE_OPERATION(uint8_t, u8);
+KLEIDICV_SCALE_API(float, f32);
+KLEIDICV_SCALE_OPERATION(float, f32);
 
 template <typename ElementType>
 class ScaleTestBase : public UnaryOperationTest<ElementType> {
  protected:
   using UnaryOperationTest<ElementType>::min;
   using UnaryOperationTest<ElementType>::max;
+  using typename UnaryOperationTest<ElementType>::Elements;
 
   // Calls the API-under-test in the appropriate way.
   kleidicv_error_t call_api() override {
-    return kleidicv_scale_u8(this->inputs_[0].data(), this->inputs_[0].stride(),
-                             this->actual_[0].data(), this->actual_[0].stride(),
-                             this->width(), this->height(), this->scale(),
-                             this->shift());
+    return scale_api<ElementType>()(
+        this->inputs_[0].data(), this->inputs_[0].stride(),
+        this->actual_[0].data(), this->actual_[0].stride(), this->width(),
+        this->height(), this->scale(), this->shift());
   }
   virtual float scale() = 0;
   virtual float shift() = 0;
 
   // Prepares expected outputs for the operation.
   void setup() override {
-    ElementType expected = 0;
-    if (shift() < min()) {
-      expected = min();
-    } else if (shift() > max()) {
-      expected = max();
-    } else {
-      expected = lrintf(shift());
-    }
-    this->expected_[0].fill(expected);
+    this->expected_[0].fill(
+        scale_operation<ElementType>()(0, scale(), shift()));
     UnaryOperationTest<ElementType>::setup();
   }
+
+  void fill_expected(std::vector<Elements>& elements) {
+    for (size_t i = 0; i < elements.size(); ++i) {
+      elements[i].values[1] = scale_operation<ElementType>()(
+          elements[i].values[0], scale(), shift());
+    }
+  }
 };  // end of class ScaleTestBase<ElementType>
 
 template <typename ElementType>
 class ScaleTestLinearBase {
  public:
   // Sets the number of padding bytes at the end of rows.
-  ScaleTestLinearBase<ElementType>& with_padding(size_t padding) {
+  ScaleTestLinearBase& with_padding(size_t padding) {
     padding_ = padding;
     return *this;
   }
@@ -73,42 +110,51 @@ class ScaleTestLinearBase {
   static constexpr ElementType max() {
     return std::numeric_limits<ElementType>::max();
   }
+  static constexpr ElementType lowest() {
+    return std::numeric_limits<ElementType>::lowest();
+  }
   virtual float scale() = 0;
   virtual float shift() = 0;
 
  private:
   class GenerateLinearSeries : public test::Generator<ElementType> {
    public:
-    explicit GenerateLinearSeries(ElementType start_from)
-        : counter_{start_from} {}
+    explicit GenerateLinearSeries(ElementType start_from, ElementType step)
+        : counter_{start_from}, step_{step} {}
 
-    std::optional<ElementType> next() override { return counter_++; }
+    std::optional<ElementType> next() override { return counter_ + step_; }
 
    private:
-    ElementType counter_;
+    ElementType counter_, step_;
   };  // end of class GenerateLinearSeries
 
   // Number of padding bytes at the end of rows.
   size_t padding_{0};
 
   void test_linear(size_t width, size_t minimum_size) {
-    size_t image_size =
-        std::max(minimum_size, static_cast<size_t>(max() - min()));
+    size_t image_size = std::max(
+        minimum_size,
+        std::min(saturating_cast<size_t, ElementType>(max() - lowest()),
+                 10000UL));
+    size_t step =
+        std::max(static_cast<ElementType>(image_size / (max() - lowest())),
+                 static_cast<ElementType>(1));
     size_t height = image_size / width + 1;
     test::Array2D<ElementType> source(width, height, padding_, 1);
     test::Array2D<ElementType> expected(width, height, padding_, 1);
     test::Array2D<ElementType> actual =
         test::Array2D<ElementType>(width, height, padding_, 1);
 
-    GenerateLinearSeries generator(min());
+    GenerateLinearSeries generator(min(), step);
 
     source.fill(generator);
 
     calculate_expected(source, expected);
 
-    ASSERT_EQ(KLEIDICV_OK, kleidicv_scale_u8(source.data(), source.stride(),
-                                             actual.data(), actual.stride(),
-                                             width, height, scale(), shift()));
+    ASSERT_EQ(KLEIDICV_OK,
+              scale_api<ElementType>()(source.data(), source.stride(),
+                                       actual.data(), actual.stride(), width,
+                                       height, scale(), shift()));
 
     EXPECT_EQ_ARRAY2D(expected, actual);
   }
@@ -118,19 +164,9 @@ class ScaleTestLinearBase {
                           test::Array2D<ElementType>& expected) {
     for (size_t hindex = 0; hindex < source.height(); ++hindex) {
       for (size_t vindex = 0; vindex < source.width(); ++vindex) {
-        ElementType calculated = 0;
-        // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult)
-        float result = *source.at(hindex, vindex) * scale() + shift();
-        // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult)
         // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign)
-        if (result > max()) {
-          calculated = max();
-        } else if (result < min()) {
-          calculated = min();
-        } else {
-          calculated = lrintf(result);
-        }
-        *expected.at(hindex, vindex) = calculated;
+        *expected.at(hindex, vindex) = scale_operation<ElementType>()(
+            *source.at(hindex, vindex), scale(), shift());
         // NOLINTEND(clang-analyzer-core.uninitialized.Assign)
       }
     }
@@ -159,16 +195,17 @@ template <typename ElementType>
 class ScaleTestAdd final : public ScaleTestBase<ElementType> {
   using Elements = typename UnaryOperationTest<ElementType>::Elements;
 
-  float scale() override { return 6; }
-  float shift() override { return 2; }
+  float scale() override { return 1; }
+  float shift() override { return 6; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      { 8, 50},
-      {12, 74},
+      { 8, 14},
+      {12, 18},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -181,12 +218,13 @@ class ScaleTestSubtract final : public ScaleTestBase<ElementType> {
   float shift() override { return -3; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
       { 6,  45},
       { 20, 157},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -199,12 +237,13 @@ class ScaleTestDivide final : public ScaleTestBase<ElementType> {
   float shift() override { return 3; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      { 252, 66},
-      { 255, 67},
+      { 252, 0},
+      { 255, 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -217,12 +256,13 @@ class ScaleTestMultiply final : public ScaleTestBase<ElementType> {
   float shift() override { return 2.72; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      { 60, 191},
-      { 75, 238},
+      { 60, 0},
+      { 45, 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -232,18 +272,21 @@ class ScaleTestZero final : public ScaleTestBase<ElementType> {
   using Elements = typename UnaryOperationTest<ElementType>::Elements;
   using UnaryOperationTest<ElementType>::min;
   using UnaryOperationTest<ElementType>::max;
+  using UnaryOperationTest<ElementType>::lowest;
 
   float scale() override { return 0; }
   float shift() override { return 0; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
+      { lowest(), 0},
       { min(), 0},
       {     0, 0},
       { max(), 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -252,18 +295,24 @@ template <typename ElementType>
 class ScaleTestUnderflowByShift final : public ScaleTestBase<ElementType> {
   using Elements = typename UnaryOperationTest<ElementType>::Elements;
   using UnaryOperationTest<ElementType>::min;
-  using UnaryOperationTest<ElementType>::max;
+  using UnaryOperationTest<ElementType>::lowest;
 
   float scale() override { return 1; }
   float shift() override { return -1; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      {min() + 1, min()},
-      {    min(), min()},
+      {lowest() + 1, 0},
+      {    lowest(), 0},
+      {   min() + 1, 0},
+      {       min(), 0},
+      {           0, 0},
+      {           1, 0},
+      {           2, 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -277,12 +326,13 @@ class ScaleTestOverflowByShift final : public ScaleTestBase<ElementType> {
   float shift() override { return 1; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      {max() - 1, max()},
-      {    max(), max()},
+      {max() - 1, 0},
+      {    max(), 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -297,11 +347,12 @@ class ScaleTestUnderflowByScale final : public ScaleTestBase<ElementType> {
   float shift() override { return 0; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      { max(), min()}
+      { max(), 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -315,11 +366,12 @@ class ScaleTestOverflowByScale final : public ScaleTestBase<ElementType> {
   float shift() override { return 0; }
 
   const std::vector<Elements>& test_elements() override {
-    static const std::vector<Elements> kTestElements = {
+    static std::vector<Elements> kTestElements = {
         // clang-format off
-      { max(), max()},
+      { max(), 0},
         // clang-format on
     };
+    ScaleTestBase<ElementType>::fill_expected(kTestElements);
     return kTestElements;
   }
 };
@@ -327,9 +379,8 @@ class ScaleTestOverflowByScale final : public ScaleTestBase<ElementType> {
 template <typename ElementType>
 class ScaleTest : public testing::Test {};
 
-using ElementTypes = ::testing::Types<uint8_t>;
+using ElementTypes = ::testing::Types<uint8_t, float>;
 
-// Tests kleidicv_scale_u8 API.
 TYPED_TEST_SUITE(ScaleTest, ElementTypes);
 
 TYPED_TEST(ScaleTest, TestScalar1) {
@@ -378,13 +429,37 @@ TYPED_TEST(ScaleTest, TestVector3Tbx) {
   ScaleTestLinear3<TypeParam>{}.test_vector(2500);
 }
 
-TYPED_TEST(ScaleTest, TestAdd) { ScaleTestAdd<TypeParam>{}.test(); }
+TYPED_TEST(ScaleTest, TestAdd) {
+  ScaleTestAdd<TypeParam>{}.test();
+  ScaleTestAdd<TypeParam>{}
+      .with_padding(1)
+      .with_width(test::Options::vector_lanes<TypeParam>() - 1)
+      .test();
+}
 
-TYPED_TEST(ScaleTest, TestSubtract) { ScaleTestSubtract<TypeParam>{}.test(); }
+TYPED_TEST(ScaleTest, TestSubtract) {
+  ScaleTestSubtract<TypeParam>{}.test();
+  ScaleTestSubtract<TypeParam>{}
+      .with_padding(1)
+      .with_width(test::Options::vector_lanes<TypeParam>() - 1)
+      .test();
+}
 
-TYPED_TEST(ScaleTest, TestDivide) { ScaleTestDivide<TypeParam>{}.test(); }
+TYPED_TEST(ScaleTest, TestDivide) {
+  ScaleTestDivide<TypeParam>{}.test();
+  ScaleTestDivide<TypeParam>{}
+      .with_padding(1)
+      .with_width(test::Options::vector_lanes<TypeParam>() - 1)
+      .test();
+}
 
-TYPED_TEST(ScaleTest, TestMultiply) { ScaleTestMultiply<TypeParam>{}.test(); }
+TYPED_TEST(ScaleTest, TestMultiply) {
+  ScaleTestMultiply<TypeParam>{}.test();
+  ScaleTestMultiply<TypeParam>{}
+      .with_padding(1)
+      .with_width(test::Options::vector_lanes<TypeParam>() - 1)
+      .test();
+}
 
 TYPED_TEST(ScaleTest, TestZero) { ScaleTestZero<TypeParam>{}.test(); }
 
@@ -406,7 +481,7 @@ TYPED_TEST(ScaleTest, TestOverflowByScale) {
 
 TYPED_TEST(ScaleTest, NullPointer) {
   TypeParam src[1] = {}, dst[1];
-  test::test_null_args(scale<TypeParam>(), src, sizeof(TypeParam), dst,
+  test::test_null_args(scale_api<TypeParam>(), src, sizeof(TypeParam), dst,
                        sizeof(TypeParam), 1, 1, 2, 0);
 }
 
@@ -417,28 +492,29 @@ TYPED_TEST(ScaleTest, Misalignment) {
   }
   TypeParam src[2] = {}, dst[2];
   EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT,
-            scale<TypeParam>()(src, sizeof(TypeParam) + 1, dst,
-                               sizeof(TypeParam), 1, 2, 2, 0));
+            scale_api<TypeParam>()(src, sizeof(TypeParam) + 1, dst,
+                                   sizeof(TypeParam), 1, 2, 2, 0));
   EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT,
-            scale<TypeParam>()(src, sizeof(TypeParam), dst,
-                               sizeof(TypeParam) + 1, 1, 2, 2, 0));
+            scale_api<TypeParam>()(src, sizeof(TypeParam), dst,
+                                   sizeof(TypeParam) + 1, 1, 2, 2, 0));
 }
 
 TYPED_TEST(ScaleTest, ZeroImageSize) {
   TypeParam src[1] = {}, dst[1];
-  EXPECT_EQ(KLEIDICV_OK, scale<TypeParam>()(src, sizeof(TypeParam), dst,
-                                            sizeof(TypeParam), 0, 1, 2, 0));
-  EXPECT_EQ(KLEIDICV_OK, scale<TypeParam>()(src, sizeof(TypeParam), dst,
-                                            sizeof(TypeParam), 1, 0, 2, 0));
+  EXPECT_EQ(KLEIDICV_OK, scale_api<TypeParam>()(src, sizeof(TypeParam), dst,
+                                                sizeof(TypeParam), 0, 1, 2, 0));
+  EXPECT_EQ(KLEIDICV_OK, scale_api<TypeParam>()(src, sizeof(TypeParam), dst,
+                                                sizeof(TypeParam), 1, 0, 2, 0));
 }
 
 TYPED_TEST(ScaleTest, OversizeImage) {
   TypeParam src[1] = {}, dst[1];
+  EXPECT_EQ(
+      KLEIDICV_ERROR_RANGE,
+      scale_api<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
+                             KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 2, 0));
   EXPECT_EQ(KLEIDICV_ERROR_RANGE,
-            scale<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
-                               KLEIDICV_MAX_IMAGE_PIXELS + 1, 1, 2, 0));
-  EXPECT_EQ(KLEIDICV_ERROR_RANGE,
-            scale<TypeParam>()(src, sizeof(TypeParam), dst, sizeof(TypeParam),
-                               KLEIDICV_MAX_IMAGE_PIXELS,
-                               KLEIDICV_MAX_IMAGE_PIXELS, 2, 0));
+            scale_api<TypeParam>()(src, sizeof(TypeParam), dst,
+                                   sizeof(TypeParam), KLEIDICV_MAX_IMAGE_PIXELS,
+                                   KLEIDICV_MAX_IMAGE_PIXELS, 2, 0));
 }
diff --git a/test/framework/operation.h b/test/framework/operation.h
index f8e78f90f1cb4765f4ab11deb37257cf21b63e57..2cc59e8f8bd0d4e790c7773b670992c3d734ff4b 100644
--- a/test/framework/operation.h
+++ b/test/framework/operation.h
@@ -142,6 +142,11 @@ class OperationTest {
     return std::numeric_limits<ElementType>::min();
   }
 
+  // Returns the lowest value for ElementType.
+  static constexpr ElementType lowest() {
+    return std::numeric_limits<ElementType>::lowest();
+  }
+
   // Returns the maximum value for ElementType.
   static constexpr ElementType max() {
     return std::numeric_limits<ElementType>::max();