From 481310508d3ba57bb8a2e51b305ea070a9e58f35 Mon Sep 17 00:00:00 2001
From: Denes Tarjan <denes.tarjan@arm.com>
Date: Thu, 29 May 2025 13:12:38 +0000
Subject: [PATCH] Fix ScaleFloat in case of in-place operation

NEON variant only. Fixes #9. Root cause:
Due to TryToAvoidTailLoop, when width is bigger than 4 and
is not divisible with 4, it scaled some elements twice.
---
 conformity/opencv/test_scale.cpp        | 48 ++++++++++--------
 kleidicv/src/arithmetics/scale_neon.cpp |  8 +--
 test/api/test_scale.cpp                 | 65 +++++++++++++++++--------
 3 files changed, 77 insertions(+), 44 deletions(-)
diff --git a/conformity/opencv/test_scale.cpp b/conformity/opencv/test_scale.cpp
index bc1614a17..a6df11b6a 100644
--- a/conformity/opencv/test_scale.cpp
+++ b/conformity/opencv/test_scale.cpp
@@ -1,23 +1,23 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include <limits>
-#include <type_traits>
-#include <vector>
-
 #include "opencv2/core/hal/interface.h"
-#include "tests.h"
+#include "utils.h"
 
-template <int Scale, int Shift>
+template <int Scale, int Shift, bool InPlace>
 cv::Mat exec_scale(cv::Mat& input_mat) {
+  if constexpr (InPlace) {
+    input_mat.convertTo(input_mat, -1, Scale / 1000.0, Shift / 1000.0);
+    return input_mat;
+  }
   cv::Mat result;
   input_mat.convertTo(result, -1, Scale / 1000.0, Shift / 1000.0);
   return result;
 }
 
 #if MANAGER
-template <int Scale, int Shift, size_t Format>
+template <int Scale, int Shift, size_t Format, bool InPlace>
 bool test_scale(int index, RecreatedMessageQueue& request_queue,
                 RecreatedMessageQueue& reply_queue) {
   cv::RNG rng(0);
@@ -26,9 +26,9 @@ bool test_scale(int index, RecreatedMessageQueue& request_queue,
     for (size_t y = 5; y <= 16; ++y) {
       cv::Mat input_mat(x, y, Format);
       rng.fill(input_mat, cv::RNG::NORMAL, 0.0, 1.0e10);
-      cv::Mat actual_mat = exec_scale<Scale, Shift>(input_mat);
       cv::Mat expected_mat = get_expected_from_subordinate(
           index, request_queue, reply_queue, input_mat);
+      cv::Mat actual_mat = exec_scale<Scale, Shift, InPlace>(input_mat);
 
       bool success =
           (CV_MAT_DEPTH(Format) == CV_32F &&
@@ -50,17 +50,27 @@ bool test_scale(int index, RecreatedMessageQueue& request_queue,
 std::vector<test>& scale_tests_get() {
   // clang-format off
   static std::vector<test> tests = {
-    TEST("Scale float32, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_32FC4>), (exec_scale<1000, 2000>)),
-    TEST("Scale float32, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_32FC1>), (exec_scale<(-10000), 0>)),
-    TEST("Scale float32, scale=3.14, shift=2.72", (test_scale<3140, 2720, CV_32FC2>), (exec_scale<3140, 2720>)),
-    TEST("Scale float32, scale=7e5, shift=8e-3", (test_scale<700000000, 8, CV_32FC3>), (exec_scale<700000000, 8>)),
-    TEST("Scale float32, scale=1e-3, shift=8e-3", (test_scale<1, 8, CV_32FC4>), (exec_scale<1, 8>)),
+    TEST("Scale float32, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_32FC4, false>), (exec_scale<1000, 2000, false>)),
+    TEST("Scale float32, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_32FC1, false>), (exec_scale<(-10000), 0, false>)),
+    TEST("Scale float32, scale=3.14, shift=2.72", (test_scale<3140, 2720, CV_32FC2, false>), (exec_scale<3140, 2720, false>)),
+    TEST("Scale float32, scale=7e5, shift=8e-3", (test_scale<700000000, 8, CV_32FC3, false>), (exec_scale<700000000, 8, false>)),
+    TEST("Scale float32, scale=1e-3, shift=8e-3", (test_scale<1, 8, CV_32FC4, false>), (exec_scale<1, 8, false>)),
+    TEST("Scale float32, scale=1.0, shift=2.0, in-place", (test_scale<1000, 2000, CV_32FC4, true>), (exec_scale<1000, 2000, true>)),
+    TEST("Scale float32, scale=-10.0, shift=0.0, in-place", (test_scale<(-10000), 0, CV_32FC1, true>), (exec_scale<(-10000), 0, true>)),
+    TEST("Scale float32, scale=3.14, shift=2.72, in-place", (test_scale<3140, 2720, CV_32FC2, true>), (exec_scale<3140, 2720, true>)),
+    TEST("Scale float32, scale=7e5, shift=8e-3, in-place", (test_scale<700000000, 8, CV_32FC3, true>), (exec_scale<700000000, 8, true>)),
+    TEST("Scale float32, scale=1e-3, shift=8e-3, in-place", (test_scale<1, 8, CV_32FC4, true>), (exec_scale<1, 8, true>)),
 
-    TEST("Scale uint8, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_8UC4>), (exec_scale<1000, 2000>)),
-    TEST("Scale uint8, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_8UC1>), (exec_scale<(-10000), 0>)),
-    TEST("Scale uint8, scale=3.14, shift=-2.72", (test_scale<3140, -2720, CV_8UC2>), (exec_scale<3140, -2720>)),
-    TEST("Scale uint8, scale=17.17, shift=3.9", (test_scale<17170, 3900, CV_8UC3>), (exec_scale<17170, 3900>)),
-    TEST("Scale uint8, scale=0.13, shift=230", (test_scale<130, 230000, CV_8UC4>), (exec_scale<130, 230000>)),
+    TEST("Scale uint8, scale=1.0, shift=2.0", (test_scale<1000, 2000, CV_8UC4, false>), (exec_scale<1000, 2000, false>)),
+    TEST("Scale uint8, scale=-10.0, shift=0.0", (test_scale<(-10000), 0, CV_8UC1, false>), (exec_scale<(-10000), 0, false>)),
+    TEST("Scale uint8, scale=3.14, shift=-2.72", (test_scale<3140, -2720, CV_8UC2, false>), (exec_scale<3140, -2720, false>)),
+    TEST("Scale uint8, scale=17.17, shift=3.9", (test_scale<17170, 3900, CV_8UC3, false>), (exec_scale<17170, 3900, false>)),
+    TEST("Scale uint8, scale=0.13, shift=230", (test_scale<130, 230000, CV_8UC4, false>), (exec_scale<130, 230000, false>)),
+    TEST("Scale uint8, scale=1.0, shift=2.0, in-place", (test_scale<1000, 2000, CV_8UC4, true>), (exec_scale<1000, 2000, true>)),
+    TEST("Scale uint8, scale=-10.0, shift=0.0, in-place", (test_scale<(-10000), 0, CV_8UC1, true>), (exec_scale<(-10000), 0, true>)),
+    TEST("Scale uint8, scale=3.14, shift=-2.72, in-place", (test_scale<3140, -2720, CV_8UC2, true>), (exec_scale<3140, -2720, true>)),
+    TEST("Scale uint8, scale=17.17, shift=3.9, in-place", (test_scale<17170, 3900, CV_8UC3, true>), (exec_scale<17170, 3900, true>)),
+    TEST("Scale uint8, scale=0.13, shift=230, in-place", (test_scale<130, 230000, CV_8UC4, true>), (exec_scale<130, 230000, true>)),
   };
   // clang-format on
   return tests;
diff --git a/kleidicv/src/arithmetics/scale_neon.cpp b/kleidicv/src/arithmetics/scale_neon.cpp
index 3680259ce..a2808a117 100644
--- a/kleidicv/src/arithmetics/scale_neon.cpp
+++ b/kleidicv/src/arithmetics/scale_neon.cpp
@@ -263,9 +263,7 @@ std::array<uint8_t, 256> precalculate_scale_table_u8(float scale, float shift) {
 // Float implementation
 // -----------------------------------------------------------------------
 
-class AddFloat final : public UnrollTwice,
-                       public UnrollOnce,
-                       public TryToAvoidTailLoop {
+class AddFloat final : public UnrollTwice, public UnrollOnce {
  public:
   using ScalarType = float;
   using VecTraits = neon::VecTraits<ScalarType>;
@@ -284,9 +282,7 @@ class AddFloat final : public UnrollTwice,
   float32x4_t vshift_;
 };  // end of class AddFloat
 
-class ScaleFloat final : public UnrollTwice,
-                         public UnrollOnce,
-                         public TryToAvoidTailLoop {
+class ScaleFloat final : public UnrollTwice, public UnrollOnce {
  public:
   using ScalarType = float;
   using VecTraits = neon::VecTraits<ScalarType>;
diff --git a/test/api/test_scale.cpp b/test/api/test_scale.cpp
index 0d404cfe5..698b9f4b9 100644
--- a/test/api/test_scale.cpp
+++ b/test/api/test_scale.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -93,14 +93,14 @@ class ScaleTestLinearBase {
   }
 
   // minimum_size set by caller to trigger the 'big' scale path.
-  void test_scalar(size_t minimum_size = 1) {
+  void test_scalar(size_t minimum_size = 1, bool in_place = false) {
     size_t width = test::Options::vector_length() - 1;
-    test_linear(width, minimum_size);
+    test_linear(width, minimum_size, in_place);
   }
 
-  void test_vector(size_t minimum_size = 1) {
+  void test_vector(size_t minimum_size = 1, bool in_place = false) {
     size_t width = test::Options::vector_length() * 2;
-    test_linear(width, minimum_size);
+    test_linear(width, minimum_size, in_place);
   }
 
  protected:
@@ -132,7 +132,7 @@ class ScaleTestLinearBase {
   // Number of padding bytes at the end of rows.
   size_t padding_{0};
 
-  void test_linear(size_t width, size_t minimum_size) {
+  void test_linear(size_t width, size_t minimum_size, bool in_place) {
     size_t image_size = std::max(
         minimum_size,
         std::min(saturating_cast<size_t, ElementType>(max() - lowest()),
@@ -153,11 +153,19 @@ class ScaleTestLinearBase {
 
     calculate_expected(source, expected);
 
-    ASSERT_EQ(KLEIDICV_OK,
-              scale_api<ElementType>()(source.data(), source.stride(),
-                                       actual.data(), actual.stride(), width,
-                                       height, scale(), shift()));
-
+    if (in_place) {
+      actual = source;
+      ASSERT_EQ(KLEIDICV_OK,
+                scale_api<ElementType>()(actual.data(), actual.stride(),
+                                         actual.data(), actual.stride(), width,
+                                         height, scale(), shift()));
+
+    } else {
+      ASSERT_EQ(KLEIDICV_OK,
+                scale_api<ElementType>()(source.data(), source.stride(),
+                                         actual.data(), actual.stride(), width,
+                                         height, scale(), shift()));
+    }
     EXPECT_EQ_ARRAY2D(expected, actual);
   }
 
@@ -194,6 +202,12 @@ class ScaleTestLinear3 final : public ScaleTestLinearBase<ElementType> {
   float shift() override { return 1.41; };
 };
 
+template <typename ElementType>
+class ScaleTestLinear4 final : public ScaleTestLinearBase<ElementType> {
+  float scale() override { return 1.0; };
+  float shift() override { return -17.7; };
+};
+
 template <typename ElementType>
 class ScaleTestAdd final : public ScaleTestBase<ElementType> {
   using Elements = typename UnaryOperationTest<ElementType>::Elements;
@@ -397,12 +411,12 @@ TYPED_TEST(ScaleTest, TestVector1) {
 }
 
 TYPED_TEST(ScaleTest, TestScalar1Tbx) {
-  ScaleTestLinear1<TypeParam>{}.test_scalar(2500);
-  ScaleTestLinear1<TypeParam>{}.with_padding(1).test_scalar(2500);
+  ScaleTestLinear1<TypeParam>{}.test_scalar(700);
+  ScaleTestLinear1<TypeParam>{}.with_padding(1).test_scalar(700);
 }
 TYPED_TEST(ScaleTest, TestVector1Tbx) {
-  ScaleTestLinear1<TypeParam>{}.test_vector(2500);
-  ScaleTestLinear1<TypeParam>{}.with_padding(1).test_vector(2500);
+  ScaleTestLinear1<TypeParam>{}.test_vector(700);
+  ScaleTestLinear1<TypeParam>{}.with_padding(1).test_vector(700);
 }
 
 TYPED_TEST(ScaleTest, TestScalar2) {
@@ -413,10 +427,10 @@ TYPED_TEST(ScaleTest, TestVector2) {
 }
 
 TYPED_TEST(ScaleTest, TestScalar2Tbx) {
-  ScaleTestLinear2<TypeParam>{}.test_scalar(2500);
+  ScaleTestLinear2<TypeParam>{}.test_scalar(700);
 }
 TYPED_TEST(ScaleTest, TestVector2Tbx) {
-  ScaleTestLinear2<TypeParam>{}.test_vector(2500);
+  ScaleTestLinear2<TypeParam>{}.test_vector(700);
 }
 
 TYPED_TEST(ScaleTest, TestScalar3) {
@@ -427,10 +441,23 @@ TYPED_TEST(ScaleTest, TestVector3) {
 }
 
 TYPED_TEST(ScaleTest, TestScalar3Tbx) {
-  ScaleTestLinear3<TypeParam>{}.test_scalar(2500);
+  ScaleTestLinear3<TypeParam>{}.test_scalar(700);
 }
 TYPED_TEST(ScaleTest, TestVector3Tbx) {
-  ScaleTestLinear3<TypeParam>{}.test_vector(2500);
+  ScaleTestLinear3<TypeParam>{}.test_vector(700);
+}
+
+TYPED_TEST(ScaleTest, InPlaceScalar2) {
+  ScaleTestLinear2<TypeParam>{}.test_scalar(1, true);
+}
+TYPED_TEST(ScaleTest, InPlaceVector2) {
+  ScaleTestLinear2<TypeParam>{}.test_vector(1, true);
+}
+TYPED_TEST(ScaleTest, InPlaceAddScalar) {
+  ScaleTestLinear4<TypeParam>{}.test_scalar(1, true);
+}
+TYPED_TEST(ScaleTest, InPlaceAddVector) {
+  ScaleTestLinear4<TypeParam>{}.test_vector(1, true);
 }
 
 TYPED_TEST(ScaleTest, TestAdd) {
-- 
GitLab