From c14d5e1e5be293013b4f8b0bbbcc718fed9c6f27 Mon Sep 17 00:00:00 2001
From: Michael Platings <michael.platings@arm.com>
Date: Fri, 14 Jun 2024 14:55:04 +0100
Subject: [PATCH] Enable bitwise and, add, sub, mul, absdiff in OpenCV HAL

Experiments have shown that although there's little scope for optimizing
such simple operations, the KleidiCV implementations are marginally
faster than OpenCV.

Don't enable 32-bit operations in HAL: OpenCV requires that these do not
saturate, whereas the KleidiCV implementations do saturate.
---
 CHANGELOG.md                         |  1 +
 adapters/opencv/kleidicv_hal.h       | 14 --------------
 conformity/opencv/test_binary_op.cpp |  5 +++++
 doc/opencv.md                        | 20 ++++++++++++++++++++
 4 files changed, 26 insertions(+), 14 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 07f0f59a7..f8d9b9d5f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ This changelog aims to follow the guiding principles of
 - Bitwise and.
 - Gaussian Blur for 7x7 kernels.
 - Scale function for float.
+- Add, subtract, multiply, absdiff & bitwise and enabled in OpenCV HAL.
 
 ### Fixed
 
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 1751b73c0..5878c0115 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -346,8 +346,6 @@ static inline int kleidicv_compare_u8_with_fallback(
 #undef cv_hal_cmp8u
 #define cv_hal_cmp8u kleidicv_compare_u8_with_fallback
 
-#if KLEIDICV_ENABLE_ALL_OPENCV_HAL
-
 // clang-format off
 #undef cv_hal_add8s
 #define cv_hal_add8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s8, __VA_ARGS__)
@@ -357,8 +355,6 @@ static inline int kleidicv_compare_u8_with_fallback(
 #define cv_hal_add16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s16, __VA_ARGS__)
 #undef cv_hal_add16u
 #define cv_hal_add16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_u16, __VA_ARGS__)
-#undef cv_hal_add32s
-#define cv_hal_add32s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_add_s32, __VA_ARGS__)
 
 #undef cv_hal_sub8s
 #define cv_hal_sub8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s8, __VA_ARGS__)
@@ -368,8 +364,6 @@ static inline int kleidicv_compare_u8_with_fallback(
 #define cv_hal_sub16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s16, __VA_ARGS__)
 #undef cv_hal_sub16u
 #define cv_hal_sub16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_u16, __VA_ARGS__)
-#undef cv_hal_sub32s
-#define cv_hal_sub32s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_sub_s32, __VA_ARGS__)
 
 #undef cv_hal_absdiff8s
 #define cv_hal_absdiff8s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s8, __VA_ARGS__)
@@ -379,8 +373,6 @@ static inline int kleidicv_compare_u8_with_fallback(
 #define cv_hal_absdiff16s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s16, __VA_ARGS__)
 #undef cv_hal_absdiff16u
 #define cv_hal_absdiff16u(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_u16, __VA_ARGS__)
-#undef cv_hal_absdiff32s
-#define cv_hal_absdiff32s(...) KLEIDICV_HAL_FORWARD(kleidicv_saturating_absdiff_s32, __VA_ARGS__)
 
 #undef cv_hal_and8u
 #define cv_hal_and8u(...) KLEIDICV_HAL_FORWARD(kleidicv_bitwise_and, __VA_ARGS__)
@@ -416,12 +408,6 @@ KLEIDICV_HAL_MUL(mul16s, kleidicv_saturating_multiply_s16, int16_t);
 #undef cv_hal_mul16s
 #define cv_hal_mul16s kleidicv_mul16s_with_fallback
 
-KLEIDICV_HAL_MUL(mul32s, kleidicv_saturating_multiply_s32, int32_t);
-#undef cv_hal_mul32s
-#define cv_hal_mul32s kleidicv_mul32s_with_fallback
-
-#endif  // KLEIDICV_ENABLE_ALL_OPENCV_HAL
-
 #endif  // OPENCV_CORE_HAL_REPLACEMENT_HPP
 
 // Remove no longer needed macro definitions.
diff --git a/conformity/opencv/test_binary_op.cpp b/conformity/opencv/test_binary_op.cpp
index cc772012c..b5e0c297b 100644
--- a/conformity/opencv/test_binary_op.cpp
+++ b/conformity/opencv/test_binary_op.cpp
@@ -81,9 +81,14 @@ static bool test_binary_op(int index, RecreatedMessageQueue& request_queue,
 std::vector<test>& binary_op_tests_get() {
   static std::vector<test> tests = {
       BINARY_OP_TEST(add, CV_8SC1, int8_t),
+      BINARY_OP_TEST(add, CV_32SC1, int32_t),
       BINARY_OP_TEST(sub, CV_8UC2, uint8_t),
+      BINARY_OP_TEST(sub, CV_32SC1, int32_t),
+      BINARY_OP_TEST(mul, CV_8SC2, int8_t),
       BINARY_OP_TEST(mul, CV_16UC3, uint16_t),
+      BINARY_OP_TEST(mul, CV_32SC1, int32_t),
       BINARY_OP_TEST(absdiff, CV_16SC4, int16_t),
+      BINARY_OP_TEST(absdiff, CV_32SC1, int32_t),
       BINARY_OP_TEST(bitwise_and, CV_32SC2, int32_t),
       BINARY_OP_TEST(compare_equal, CV_8UC1, uint8_t),
       BINARY_OP_TEST(compare_greater, CV_8UC1, uint8_t),
diff --git a/doc/opencv.md b/doc/opencv.md
index a2047eea0..0257535bd 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -20,6 +20,26 @@ to decide whether to enable KleidiCV in a multicore environment.
 
 ## Functionality in KleidiCV OpenCV HAL
 
+### `add`, `subtract`, `absdiff`
+Element-wise addition, subtraction and absolute difference.
+
+Notes on parameters:
+* `depth` - `CV_8U`, `CV_8S`, `CV_16U` & `CV_16S` are supported.
+  `CV_32S` is not supported as KleidiCV does not currently provide the
+  non-saturating implementation required by OpenCV for this type.
+
+### `multiply`
+Element-wise multiplication.
+
+Notes on parameters:
+* `depth` - `CV_8U`, `CV_8S`, `CV_16U` & `CV_16S` are supported.
+  `CV_32S` is not supported as KleidiCV does not currently provide the
+  non-saturating implementation required by OpenCV for this type.
+* `scale` - only a value of 1.0 is supported.
+
+### `bitwise_and`
+Bitwise conjunction of two arrays.
+
 ### `gray_to_bgr`
 Converts grayscale images to RGB or RGBA.
 
-- 
GitLab