From def8b80bd8a8dce04b960f27889528bcc7fbd9f2 Mon Sep 17 00:00:00 2001
From: Michael Platings <michael.platings@arm.com>
Date: Tue, 5 Nov 2024 11:05:27 +0000
Subject: [PATCH] Implement OpenCV's LKOpticalFlowLevel HAL

---
 CHANGELOG.md                                  |   1 +
 CMakeLists.txt                                |   1 +
 LICENSES/BSD-3-Clause.txt                     |  11 +
 README.md                                     |   4 +
 adapters/opencv/CMakeLists.txt                |  52 +-
 adapters/opencv/kleidicv_hal.cpp              |  17 +-
 adapters/opencv/kleidicv_hal.h                |  29 +
 adapters/opencv/opencv-4.10.patch             |  59 +-
 benchmark/CMakeLists.txt                      |  14 +-
 benchmark/benchmark.cpp                       | 136 +++-
 conformity/opencv/test_optical_flow.cpp       |  93 +++
 conformity/opencv/tests.cpp                   |   1 +
 conformity/opencv/tests.h                     |   1 +
 doc/opencv.md                                 |   5 +
 .../kleidicv/containers/small_buffer.h        |  44 ++
 kleidicv_opencv/CMakeLists.txt                | 122 +++
 kleidicv_opencv/README.md                     |  12 +
 .../include/kleidicv_opencv/config.h          |  16 +
 .../include/kleidicv_opencv/kleidicv_opencv.h |  33 +
 .../include/kleidicv_opencv/optical_flow.h    |  53 ++
 .../src/optical_flow/optical_flow_api.cpp     |  12 +
 .../src/optical_flow/optical_flow_common.h    | 238 ++++++
 .../src/optical_flow/optical_flow_neon.cpp    | 608 +++++++++++++++
 .../src/optical_flow/optical_flow_sc.h        | 208 +++++
 .../src/optical_flow/optical_flow_sme2.cpp    |  54 ++
 .../src/optical_flow/optical_flow_sve2.cpp    |  30 +
 scripts/cpplint.sh                            |   2 +-
 scripts/format.sh                             |   1 +
 test/api/CMakeLists.txt                       |   2 +
 test/api/test_optical_flow.cpp                | 728 ++++++++++++++++++
 test/api/test_small_buffer.cpp                |  34 +
 31 files changed, 2572 insertions(+), 49 deletions(-)
 create mode 100644 LICENSES/BSD-3-Clause.txt
 create mode 100644 conformity/opencv/test_optical_flow.cpp
 create mode 100644 kleidicv/include/kleidicv/containers/small_buffer.h
 create mode 100644 kleidicv_opencv/CMakeLists.txt
 create mode 100644 kleidicv_opencv/README.md
 create mode 100644 kleidicv_opencv/include/kleidicv_opencv/config.h
 create mode 100644 kleidicv_opencv/include/kleidicv_opencv/kleidicv_opencv.h
 create mode 100644 kleidicv_opencv/include/kleidicv_opencv/optical_flow.h
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_api.cpp
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_common.h
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_neon.cpp
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_sc.h
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_sme2.cpp
 create mode 100644 kleidicv_opencv/src/optical_flow/optical_flow_sve2.cpp
 create mode 100644 test/api/test_optical_flow.cpp
 create mode 100644 test/api/test_small_buffer.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ad96463c..e47f29fac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ This changelog aims to follow the guiding principles of
   - Fixed-point interpolation, for replicated borders with 1-channel u8 input.
 - WarpPerspective implementation
   - Nearest and Linear interpolation method, for 1-channel u8 input.
+- Implementation of cv::calcOpticalFlowPyrLK in the OpenCV HAL.
 
 ### Changed
 - Increased precision of sum for 32 bit floats and expose it to OpenCV HAL.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01208d679..2ca87eca4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ cmake_minimum_required(VERSION 3.16)
 project("KleidiCV" CXX)
 
 add_subdirectory(kleidicv)
+add_subdirectory(kleidicv_opencv)
 add_subdirectory(kleidicv_thread)
 add_subdirectory(test)
 add_subdirectory(benchmark)
diff --git a/LICENSES/BSD-3-Clause.txt b/LICENSES/BSD-3-Clause.txt
new file mode 100644
index 000000000..ea890afbc
--- /dev/null
+++ b/LICENSES/BSD-3-Clause.txt
@@ -0,0 +1,11 @@
+Copyright (c) <year> <owner>. 
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 7c620317b..ec3e69853 100644
--- a/README.md
+++ b/README.md
@@ -18,3 +18,7 @@ It is designed to be simple to integrate into a wide variety of projects.
 * [C API documentation](https://kleidi.sites.arm.com/kleidicv/)
 * [Benchmarking](doc/benchmark.md)
 * [Testing](doc/test.md)
+
+## Copyright and Licensing
+
+This project follows the [REUSE specification 3.0](https://reuse.software/spec-3.0/).
diff --git a/adapters/opencv/CMakeLists.txt b/adapters/opencv/CMakeLists.txt
index a53f754a0..8e451409e 100644
--- a/adapters/opencv/CMakeLists.txt
+++ b/adapters/opencv/CMakeLists.txt
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 include("${CMAKE_CURRENT_LIST_DIR}/../../kleidicv/CMakeLists.txt")
+include("${CMAKE_CURRENT_LIST_DIR}/../../kleidicv_opencv/CMakeLists.txt")
 include("${CMAKE_CURRENT_LIST_DIR}/../../kleidicv_thread/CMakeLists.txt")
 
 project("KleidiCV OpenCV HAL")
@@ -10,10 +11,10 @@ project("KleidiCV OpenCV HAL")
 set(KLEIDICV_HAL_VERSION "0.4.0" CACHE INTERNAL "")
 set(KLEIDICV_HAL_LIBRARIES "kleidicv_hal" CACHE INTERNAL "")
 set(KLEIDICV_HAL_HEADERS "${CMAKE_CURRENT_LIST_DIR}/kleidicv_hal.h" CACHE INTERNAL "")
-set(KLEIDICV_HAL_INCLUDE_DIRS "$<TARGET_PROPERTY:kleidicv_thread,INCLUDE_DIRECTORIES>" CACHE INTERNAL "")
+set(KLEIDICV_HAL_INCLUDE_DIRS "$<TARGET_PROPERTY:kleidicv_opencv,INCLUDE_DIRECTORIES>;$<TARGET_PROPERTY:kleidicv_thread,INCLUDE_DIRECTORIES>" CACHE INTERNAL "")
 
 add_library(kleidicv_hal STATIC "${CMAKE_CURRENT_LIST_DIR}/kleidicv_hal.cpp")
-target_link_libraries(kleidicv_hal PUBLIC kleidicv kleidicv_thread)
+target_link_libraries(kleidicv_hal PUBLIC kleidicv kleidicv_opencv kleidicv_thread)
 target_include_directories(kleidicv_hal PRIVATE
   ${KLEIDICV_HAL_INCLUDE_DIRS}
   ${CMAKE_CURRENT_LIST_DIR}
@@ -29,45 +30,28 @@ target_compile_options(
 
 set_target_properties(kleidicv_hal PROPERTIES CXX_STANDARD 17)
 
-if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(kleidicv_neon
+function(kleidicv_ocv_install_target name)
+  ocv_install_target(${name}
     EXPORT OpenCVModules
     ARCHIVE
     DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
     COMPONENT dev
   )
+endfunction()
+
+if(NOT BUILD_SHARED_LIBS)
+  kleidicv_ocv_install_target(kleidicv_neon)
+  kleidicv_ocv_install_target(kleidicv_opencv_neon)
   if(KLEIDICV_ENABLE_SVE2)
-    ocv_install_target(kleidicv_sve2
-      EXPORT OpenCVModules
-      ARCHIVE
-      DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
-      COMPONENT dev
-    )
+    kleidicv_ocv_install_target(kleidicv_sve2)
+    kleidicv_ocv_install_target(kleidicv_opencv_sve2)
   endif()
   if(KLEIDICV_ENABLE_SME2)
-    ocv_install_target(kleidicv_sme2
-      EXPORT OpenCVModules
-      ARCHIVE
-      DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
-      COMPONENT dev
-    )
+    kleidicv_ocv_install_target(kleidicv_sme2)
+    kleidicv_ocv_install_target(kleidicv_opencv_sme2)
   endif()
-  ocv_install_target(kleidicv
-    EXPORT OpenCVModules
-    ARCHIVE
-    DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
-    COMPONENT dev
-  )
-  ocv_install_target(kleidicv_thread
-    EXPORT OpenCVModules
-    ARCHIVE
-    DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
-    COMPONENT dev
-  )
-  ocv_install_target(kleidicv_hal
-    EXPORT OpenCVModules
-    ARCHIVE
-    DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}
-    COMPONENT dev
-  )
+  kleidicv_ocv_install_target(kleidicv)
+  kleidicv_ocv_install_target(kleidicv_opencv)
+  kleidicv_ocv_install_target(kleidicv_thread)
+  kleidicv_ocv_install_target(kleidicv_hal)
 endif()
diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp
index 2fa3c244d..79791424e 100644
--- a/adapters/opencv/kleidicv_hal.cpp
+++ b/adapters/opencv/kleidicv_hal.cpp
@@ -14,7 +14,7 @@
 
 #include "kleidicv/filters/blur_and_downsample.h"
 #include "kleidicv/filters/gaussian_blur.h"
-#include "kleidicv/kleidicv.h"
+#include "kleidicv_opencv/kleidicv_opencv.h"
 #include "kleidicv_thread/kleidicv_thread.h"
 #include "opencv2/core/base.hpp"
 #include "opencv2/core/cvdef.h"
@@ -1306,6 +1306,21 @@ int inRange_f32(const uchar *src_data, size_t src_step, uchar *dst_data,
       static_cast<float>(lower_bound), static_cast<float>(upper_bound)));
 }
 
+int optical_flow_u8(const uchar *prev_data, size_t prev_data_step,
+                    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+                    const uchar *next_data, size_t next_step, int width,
+                    int height, int cn, const float *prev_points,
+                    float *next_points, size_t point_count, uchar *status,
+                    float *err, const int win_width, const int win_height,
+                    int termination_count, double termination_epsilon,
+                    bool get_min_eigen_vals, float min_eigen_vals_threshold) {
+  return convert_error(kleidicv_opencv_optical_flow_u8(
+      prev_data, prev_data_step, prev_deriv_data, prev_deriv_step, next_data,
+      next_step, width, height, cn, prev_points, next_points, point_count,
+      status, err, win_width, win_height, termination_count,
+      termination_epsilon, get_min_eigen_vals, min_eigen_vals_threshold));
+}
+
 int remap_s16(int src_type, const uchar *src_data, size_t src_step,
               int src_width, int src_height, uchar *dst_data, size_t dst_step,
               int dst_width, int dst_height, const int16_t *mapxy,
diff --git a/adapters/opencv/kleidicv_hal.h b/adapters/opencv/kleidicv_hal.h
index 2a5f04e31..5e2904bfc 100644
--- a/adapters/opencv/kleidicv_hal.h
+++ b/adapters/opencv/kleidicv_hal.h
@@ -146,6 +146,15 @@ int inRange_f32(const uchar *src_data, size_t src_step, uchar *dst_data,
                 size_t dst_step, int dst_depth, int width, int height, int cn,
                 double lower_bound, double upper_bound);
 
+int optical_flow_u8(const uchar *prev_data, size_t prev_data_step,
+                    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+                    const uchar *next_data, size_t next_step, int width,
+                    int height, int cn, const float *prev_points,
+                    float *next_points, size_t point_count, uchar *status,
+                    float *err, const int win_width, const int win_height,
+                    int termination_count, double termination_epsilon,
+                    bool get_min_eigen_vals, float min_eigen_vals_threshold);
+
 int remap_s16(int src_type, const uchar *src_data, size_t src_step,
               int src_width, int src_height, uchar *dst_data, size_t dst_step,
               int dst_width, int dst_height, const int16_t *mapxy,
@@ -632,6 +641,26 @@ static inline int kleidicv_in_range_f32_with_fallback(
 
 #ifdef OPENCV_VIDEO_HAL_REPLACEMENT_HPP
 
+#ifdef cv_hal_LKOpticalFlowLevel
+static inline int kleidicv_opencv_optical_flow_u8_with_fallback(
+    const uchar *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uchar *next_data, size_t next_step, int width, int height, int cn,
+    const float *prev_points, float *next_points, size_t point_count,
+    uchar *status, float *err, const int win_width, const int win_height,
+    int termination_count, double termination_epsilon, bool get_min_eigen_vals,
+    float min_eigen_vals_threshold) {
+  return KLEIDICV_HAL_FALLBACK_FORWARD(
+      optical_flow_u8, cv_hal_LKOpticalFlowLevel, prev_data, prev_data_step,
+      prev_deriv_data, prev_deriv_step, next_data, next_step, width, height, cn,
+      prev_points, next_points, point_count, status, err, win_width, win_height,
+      termination_count, termination_epsilon, get_min_eigen_vals,
+      min_eigen_vals_threshold);
+}
+#undef cv_hal_LKOpticalFlowLevel
+#define cv_hal_LKOpticalFlowLevel kleidicv_opencv_optical_flow_u8_with_fallback
+#endif  // cv_hal_LKOpticalFlowLevel
+
 // ScharrDeriv
 // This condition can be removed if this HAL macro is defined in all supported
 // versions
diff --git a/adapters/opencv/opencv-4.10.patch b/adapters/opencv/opencv-4.10.patch
index 85599737b..c2b70f7ef 100644
--- a/adapters/opencv/opencv-4.10.patch
+++ b/adapters/opencv/opencv-4.10.patch
@@ -233,10 +233,10 @@ index d0f50a73bb..1c308887dc 100644
          std::vector<ufixedpoint16> fkx, fky;
 diff --git a/modules/video/src/hal_replacement.hpp b/modules/video/src/hal_replacement.hpp
 new file mode 100644
-index 0000000000..e413dd3894
+index 0000000000..396fa9a2d5
 --- /dev/null
 +++ b/modules/video/src/hal_replacement.hpp
-@@ -0,0 +1,73 @@
+@@ -0,0 +1,126 @@
 +// This file is part of OpenCV project.
 +// It is subject to the license terms in the LICENSE file found in the top-level directory
 +// of this distribution and at http://opencv.org/license.html.
@@ -257,8 +257,61 @@ index 0000000000..e413dd3894
 +#pragma GCC diagnostic ignored "-Wunused-parameter"
 +#endif
 +
++//! @addtogroup video_hal_interface
++//! @note Define your functions to override default implementations:
++//! @code
++//! #undef cv_hal_LK_optical_flow_level
++//! #define cv_hal_LK_optical_flow_level my_hal_LK_optical_flow_level
++//! @endcode
++//! @{
++
++/**
++@brief Lucas-Kanade optical flow for single pyramid layer. See calcOpticalFlowPyrLK.
++@note OpenCV builds pyramid levels with `win_size` padding. Out-of-bound access to source
++image data is legal within `+-win_size` range.
++@param prev_data previous frame image data
++@param prev_data_step previous frame image data step
++@param prev_deriv_data previous frame Schaar derivatives
++@param prev_deriv_step previous frame Schaar derivatives step
++@param next_data next frame image data
++@param next_step next frame image step
++@param width input images width
++@param height input images height
++@param cn source image channels
++@param prev_points 2d points coordinates (x,y) on the previous frame
++@param next_points points coordinates (x,y) on the next frame
++@param point_count - amount of input points
++@param status optical flow status for each point. Optional output, expected if not nullptr is provided
++@param err optical flow estimation error for each point. Optional output, expected if not nullptr is provided
++@param win_width optical flow window width
++@param win_height optical flow window heigh
++@param termination_count maximum algorithm iterations. 0 means unlimited
++@param termination_epsilon maximal allowed algorithm error
++@param get_min_eigen_vals return minimal egen values as point errors in err buffer
++@param min_eigen_vals_threshold eigen values threshold
++**/
++inline int hal_ni_LKOpticalFlowLevel(const uchar *prev_data, size_t prev_data_step,
++                       const short* prev_deriv_data, size_t prev_deriv_step,
++                       const uchar* next_data, size_t next_step,
++                       int width, int height, int cn,
++                       const float *prev_points, float *next_points, size_t point_count,
++                       uchar *status, float *err,
++                       const int win_width, const int win_height,
++                       int termination_count, double termination_epsilon,
++                       bool get_min_eigen_vals,
++                       float min_eigen_vals_threshold)
++{
++    return CV_HAL_ERROR_NOT_IMPLEMENTED;
++}
++
++//! @cond IGNORED
++#define cv_hal_LKOpticalFlowLevel hal_ni_LKOpticalFlowLevel
++//! @endcond
++
 +/**
 +@brief Computes Schaar derivatives with inteleaved layout xyxy...
++@note OpenCV builds pyramid levels with `win_size` padding. Out-of-bound access to source
++image data is legal within `+-win_size` range.
 +@param src_data source image data
 +@param src_step source image step
 +@param dst_data destination buffer data
@@ -311,7 +364,7 @@ index 0000000000..e413dd3894
 +
 +#endif
 diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
-index 6d51c0cf1a..0e6f6a324e 100644
+index 6d51c0cf1a..43ec08509e 100644
 --- a/modules/video/src/lkpyramid.cpp
 +++ b/modules/video/src/lkpyramid.cpp
 @@ -50,6 +50,7 @@
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 5e79c0c4e..bf1b6177d 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -22,13 +22,6 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(benchmark)
 
-
-set(KLEIDICV_INCLUDE_DIR
-  ${CMAKE_CURRENT_SOURCE_DIR}/../kleidicv/include
-  ${CMAKE_CURRENT_BINARY_DIR}/../kleidicv/include
-)
-set(KLEIDICV_BENCHMARK_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-
 set(KLEIDICV_BENCHMARK_CXX_FLAGS
   "-Wall"
   "-Wextra"
@@ -60,13 +53,16 @@ set_target_properties(
 
 target_include_directories(
   kleidicv-benchmark
-  PRIVATE ${KLEIDICV_INCLUDE_DIR}
-  PRIVATE ${KLEIDICV_BENCHMARK_INCLUDE_DIR}
+  PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/../kleidicv/include
+  ${CMAKE_CURRENT_BINARY_DIR}/../kleidicv/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../kleidicv_opencv/include
 )
 
 target_link_libraries(
   kleidicv-benchmark
   kleidicv
+  kleidicv_opencv
   benchmark::benchmark
 )
 
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index 2bab77141..eadf583cc 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -8,7 +8,8 @@
 #include <cstdint>
 #include <random>
 
-#include "kleidicv/kleidicv.h"
+#include "kleidicv/filters/scharr.h"
+#include "kleidicv_opencv/kleidicv_opencv.h"
 
 // These variables can be set runtime, from command line
 extern size_t image_width, image_height;
@@ -463,6 +464,139 @@ static void in_range(Function f, T lower_bound, T upper_bound,
 BENCH_IN_RANGE(in_range_u8, in_range_u8, 1, 2, uint8_t);
 BENCH_IN_RANGE(in_range_f32, in_range_f32, 1.111, 1.112, float);
 
+// Like kleidicv_scharr_interleaved_s16_u8 but handles edge pixels.
+// Edge pixels are handled using KLEIDICV_BORDER_TYPE_REVERSE (or in OpenCV
+// terminology REFLECT_101).
+static void scharr_xy_border_reverse(const uint8_t* src, size_t src_stride,
+                                     int16_t* dst, size_t dst_stride,
+                                     size_t width, size_t height) {
+  for (size_t y = 0; y < height; y++) {
+    const uint8_t* src_0 = src;
+    if (y > 0) {
+      src_0 += src_stride * (y - 1);
+    } else if (height > 1) {
+      src_0 += src_stride;
+    }
+
+    const uint8_t* src_1 = src + src_stride * y;
+
+    const uint8_t* src_2 = src;
+    if (y + 1 < height) {
+      src_2 += src_stride * (y + 1);
+    } else if (height > 1) {
+      src_2 += src_stride * (height - 2);
+    }
+
+    int16_t* dst_row = dst + y * dst_stride / sizeof(int16_t);
+    for (size_t x = 0; x < width; ++x) {
+      size_t x0 = 0;
+      if (x > 0) {
+        x0 = x - 1;
+      } else if (width > 1) {
+        x0 = 1;
+      }
+
+      size_t x2 = 0;
+      if (x + 1 < width) {
+        x2 = x + 1;
+      } else if (width > 1) {
+        x2 = width - 2;
+      }
+      dst_row[x * 2] = static_cast<int16_t>(
+          (src_0[x2] + src_2[x2] - src_0[x0] - src_2[x0]) * 3 +
+          (src_1[x2] - src_1[x0]) * 10);
+      dst_row[x * 2 + 1] = static_cast<int16_t>(
+          (src_2[x0] + src_2[x2] - src_0[x0] - src_0[x2]) * 3 +
+          (src_2[x] - src_0[x]) * 10);
+    }
+  }
+}
+
+static void optical_flow_u8(benchmark::State& state) {
+  const size_t max_window_width = 40;
+  const size_t padded_image_width = image_width + max_window_width * 2;
+  const size_t padded_image_height = image_height + max_window_width * 2;
+  std::minstd_rand generator;
+
+  auto make_prev_image = [&]() {
+    std::vector<uint8_t> prev_image(padded_image_width * padded_image_height);
+    std::generate(prev_image.begin(), prev_image.end(), generator);
+    return prev_image;
+  };
+  static const std::vector<uint8_t> prev_image = make_prev_image();
+
+  auto make_next_image = [&]() {
+    std::vector<uint8_t> next_image(prev_image.size());
+    for (size_t i = 0; i != prev_image.size(); ++i) {
+      // Make the next image the same as the previous image but with some noise
+      // added.
+      next_image[i] = prev_image[i] + (generator() & 0xF);
+    }
+    return next_image;
+  };
+  static const std::vector<uint8_t> next_image = make_next_image();
+
+  auto make_deriv = [&]() {
+    std::vector<int16_t> deriv(prev_image.size() * 2);
+    scharr_xy_border_reverse(prev_image.data(), padded_image_width,
+                             deriv.data(),
+                             padded_image_width * 2 * sizeof(int16_t),
+                             padded_image_width, padded_image_height);
+    return deriv;
+  };
+  static const std::vector<int16_t> deriv = make_deriv();
+
+  const size_t point_count_x =
+      std::max<size_t>(1, image_width / max_window_width);
+  const size_t point_count_y =
+      std::max<size_t>(1, image_height / max_window_width);
+  const size_t point_count = point_count_x * point_count_y;
+
+  auto make_prev_points = [&]() {
+    std::vector<float> prev_points;
+    prev_points.reserve(point_count * 2);
+
+    // Generate regularly spaced but jittered points to sample
+    std::uniform_real_distribution<float> point_jitter{0.0F, 1.0F};
+    for (size_t j = 0; j != point_count_y; ++j) {
+      for (size_t i = 0; i != point_count_x; ++i) {
+        float x = (i + point_jitter(generator)) * image_width / point_count_x;
+        float y = (j + point_jitter(generator)) * image_height / point_count_y;
+        prev_points.push_back(x);
+        prev_points.push_back(y);
+      }
+    }
+    return prev_points;
+  };
+  static const std::vector<float> prev_points = make_prev_points();
+  std::vector<float> next_points(point_count * 2);
+
+  std::vector<uint8_t> status(point_count);
+
+  const size_t window_width = state.range(0);
+  const size_t window_height = window_width;
+
+  bench_functor(state, [&]() {
+    (void)kleidicv_opencv_optical_flow_u8(
+        prev_image.data() + padded_image_width * window_height + window_width,
+        padded_image_width,
+        deriv.data() + (padded_image_width * window_height + window_width) * 2,
+        padded_image_width * 2 * sizeof(int16_t),
+        next_image.data() + padded_image_width * window_height + window_width,
+        padded_image_width, image_width, image_height, 1 /*channels*/,
+        prev_points.data(), next_points.data(), point_count, status.data(),
+        nullptr /*err*/, window_width, window_height, 30 /*termination_count*/,
+        0.0001 /*termination_epsilon*/, true /*get_min_eigen_vals*/,
+        0.0001 /*min_eigen_vals_threshold*/);
+  });
+}
+BENCHMARK(optical_flow_u8)
+    ->ArgName("window_width")
+    ->Arg(7)
+    ->Arg(9)
+    ->Arg(11)
+    ->Arg(21);
+
 static void blur_and_downsample_u8(benchmark::State& state) {
   kleidicv_filter_context_t* context;
   kleidicv_error_t err = kleidicv_filter_context_create(
diff --git a/conformity/opencv/test_optical_flow.cpp b/conformity/opencv/test_optical_flow.cpp
new file mode 100644
index 000000000..0792a73d2
--- /dev/null
+++ b/conformity/opencv/test_optical_flow.cpp
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "opencv2/video.hpp"
+#include "tests.h"
+
+static cv::Mat exec_optical_flow(cv::Mat& input_mat) {
+  int mid = input_mat.rows / 2;
+  cv::Mat prevImg = input_mat.rowRange(0, mid);
+  cv::Mat nextImg = input_mat.rowRange(mid, input_mat.rows);
+
+  std::vector<cv::Point2f> prevPts, nextPts;
+  std::vector<uint8_t> status;
+  std::vector<float> err;
+
+  const size_t point_count_x = prevImg.cols / 30;
+  const size_t point_count_y = prevImg.rows / 30;
+
+  for (size_t j = 0; j != point_count_y; ++j) {
+    for (size_t i = 0; i != point_count_x; ++i) {
+      float x = i * prevImg.cols / point_count_x;
+      float y = j * prevImg.rows / point_count_y;
+      prevPts.push_back(cv::Point2f(x, y));
+    }
+  }
+
+  prevPts.push_back(cv::Point2f(0, 0));
+  prevPts.push_back(cv::Point2f(prevImg.cols - 0.5F, 0));
+  prevPts.push_back(cv::Point2f(0, prevImg.rows - 0.5F));
+  prevPts.push_back(cv::Point2f(prevImg.cols - 0.5F, prevImg.rows - 0.5F));
+
+  cv::calcOpticalFlowPyrLK(prevImg, nextImg, prevPts, nextPts, status, err);
+
+  cv::Mat result(0, 1, CV_32FC1);
+  for (const auto& p : nextPts) {
+    result.push_back(p.x);
+    result.push_back(p.y);
+  }
+  for (auto s : status) {
+    result.push_back(static_cast<float>(s));
+  }
+  for (auto e : err) {
+    result.push_back(e);
+  }
+
+  return result;
+}
+
+#if MANAGER
+bool test_optical_flow(int index, RecreatedMessageQueue& request_queue,
+                       RecreatedMessageQueue& reply_queue) {
+  cv::RNG rng(0);
+
+  const size_t width = 80;
+  const size_t height = 60;
+
+  cv::Mat input_mat(height * 2, width, CV_8UC1);
+  rng.fill(input_mat, cv::RNG::UNIFORM, 0, 255);
+
+  // Rotate top half of image, rotate it and scale it a little, and put that in
+  // the bottom half of the image.
+  cv::warpAffine(input_mat(cv::Rect(0, 0, width, height)),
+                 input_mat(cv::Rect(0, height, width, height)),
+                 cv::getRotationMatrix2D(cv::Point2f(), 5, 0.95F),
+                 cv::Size(width, height), cv::INTER_LINEAR,
+                 cv::BORDER_REFLECT_101);
+
+  cv::Mat actual_mat = exec_optical_flow(input_mat);
+  cv::Mat expected_mat = get_expected_from_subordinate(index, request_queue,
+                                                       reply_queue, input_mat);
+
+  if (are_float_matrices_different<float>(1e-5F, actual_mat, expected_mat)) {
+    fail_print_matrices(width, height, input_mat, actual_mat, expected_mat);
+    return true;
+  }
+
+  return false;
+}
+#endif
+
+std::vector<test>& optical_flow_tests_get() {
+  // clang-format off
+  static std::vector<test> tests = {
+    TEST("Optical flow", (test_optical_flow), exec_optical_flow),
+  };
+  // clang-format on
+  return tests;
+}
diff --git a/conformity/opencv/tests.cpp b/conformity/opencv/tests.cpp
index 614b66cb1..e0312b748 100644
--- a/conformity/opencv/tests.cpp
+++ b/conformity/opencv/tests.cpp
@@ -26,6 +26,7 @@ std::vector<test> all_tests = merge_tests({
     binary_op_tests_get,
     cvtcolor_tests_get,
     morphology_tests_get,
+    optical_flow_tests_get,
 #if KLEIDICV_ENABLE_ALL_OPENCV_HAL
     separable_filter_2d_tests_get,
 #endif
diff --git a/conformity/opencv/tests.h b/conformity/opencv/tests.h
index 6087026f2..f790aec79 100644
--- a/conformity/opencv/tests.h
+++ b/conformity/opencv/tests.h
@@ -12,6 +12,7 @@
 std::vector<test>& binary_op_tests_get();
 std::vector<test>& cvtcolor_tests_get();
 std::vector<test>& morphology_tests_get();
+std::vector<test>& optical_flow_tests_get();
 std::vector<test>& separable_filter_2d_tests_get();
 std::vector<test>& gaussian_blur_tests_get();
 std::vector<test>& rgb2yuv_tests_get();
diff --git a/doc/opencv.md b/doc/opencv.md
index 5bef02c0a..aa95b1534 100644
--- a/doc/opencv.md
+++ b/doc/opencv.md
@@ -239,5 +239,10 @@ Notes on parameters:
 * `src.depth()` - only supports `CV_8U` and 1 channel.
 * if `dstsize` is specified it must be equal to `Size((src.cols + 1) / 2, (src.rows + 1) / 2)`
 
+### [`cv::calcOpticalFlowPyrLK()`](https://docs.opencv.org/4.10.0/dc/d6b/group__video__track.html#ga473e4b886d0bcc6b65831eb88ed93323)
+Finds optical flow of points from one image to another.
+Notes on parameters:
+* `prevImg` - only supports 1 channel.
+
 ### [`cv::buildOpticalFlowPyramid()`](https://docs.opencv.org/4.10.0/dc/d6b/group__video__track.html#ga86640c1c470f87b2660c096d2b22b2ce)
 Constructs an image pyramid which can be passed to `cv::calcOpticalFlowPyrLK`.
diff --git a/kleidicv/include/kleidicv/containers/small_buffer.h b/kleidicv/include/kleidicv/containers/small_buffer.h
new file mode 100644
index 000000000..42f3d6bac
--- /dev/null
+++ b/kleidicv/include/kleidicv/containers/small_buffer.h
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_SMALL_BUFFER_H
+#define KLEIDICV_SMALL_BUFFER_H
+
+#include <cstdlib>
+
+namespace kleidicv {
+
+// A buffer that goes on the stack if it's small enough, and otherwise is
+// allocated on the heap. Similar to the "small vector" idea, but its size is
+// fixed at creation time.
+template <typename T, size_t SizeOnStack>
+class SmallBuffer {
+ public:
+  static_assert(std::is_trivial_v<T>);
+
+  explicit SmallBuffer(size_t size)
+      : ptr_(size <= SizeOnStack
+                 ? buf_
+                 : reinterpret_cast<T *>(std::malloc(size * sizeof(T)))) {}
+
+  // non-copyable
+  SmallBuffer(const SmallBuffer &) = delete;
+  SmallBuffer &operator=(const SmallBuffer &) = delete;
+
+  ~SmallBuffer() {
+    if (ptr_ != buf_) {
+      std::free(ptr_);
+    }
+  }
+
+  T *get() { return ptr_; }
+
+ private:
+  T *ptr_;
+  T buf_[SizeOnStack];
+};
+
+}  // namespace kleidicv
+
+#endif  // KLEIDICV_SMALL_BUFFER_H
diff --git a/kleidicv_opencv/CMakeLists.txt b/kleidicv_opencv/CMakeLists.txt
new file mode 100644
index 000000000..238856118
--- /dev/null
+++ b/kleidicv_opencv/CMakeLists.txt
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.16)
+
+project("KleidiCV OpenCV")
+
+file(GLOB KLEIDICV_OPENCV_API_SOURCES
+  "${CMAKE_CURRENT_LIST_DIR}/src/*_api.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/src/**/*_api.cpp"
+)
+
+file(GLOB KLEIDICV_OPENCV_NEON_SOURCES
+  "${CMAKE_CURRENT_LIST_DIR}/src/*_neon.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/src/**/*_neon.cpp"
+)
+
+file(GLOB KLEIDICV_OPENCV_SVE2_SOURCES
+  "${CMAKE_CURRENT_LIST_DIR}/src/*_sve2.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/src/**/*_sve2.cpp"
+)
+
+file(GLOB KLEIDICV_OPENCV_SME2_SOURCES
+  "${CMAKE_CURRENT_LIST_DIR}/src/*_sme2.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/src/**/*_sme2.cpp"
+)
+
+set(KLEIDICV_OPENCV_INCLUDE_DIRS
+  "${CMAKE_CURRENT_LIST_DIR}/include"
+  $<TARGET_PROPERTY:kleidicv,INCLUDE_DIRECTORIES>
+)
+
+set(KLEIDICV_OPENCV_WARNING_FLAGS
+  "-Wall"
+  "-Wextra"
+  "-Wold-style-cast"
+  "-Wno-shadow" # GCC's shadow declaration check is too sensitive for the library
+)
+
+set(KLEIDICV_OPENCV_CXX_FLAGS
+  "-O2"
+  "-g0"
+  "-fomit-frame-pointer"
+  "-fno-stack-protector"
+  "-fno-exceptions"
+  "-fno-rtti"
+  "-fno-unroll-loops"
+  ${KLEIDICV_OPENCV_WARNING_FLAGS}
+)
+
+if(CMAKE_CXX_COMPILER_ID MATCHES ".*Clang")
+  list(APPEND KLEIDICV_OPENCV_CXX_FLAGS
+    "-mllvm"
+    "-inline-threshold=10000"
+  )
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  list(APPEND KLEIDICV_OPENCV_CXX_FLAGS
+    "-flax-vector-conversions"
+    "-Wno-unused-label"
+  )
+endif()
+
+if (KLEIDICV_CHECK_BANNED_FUNCTIONS)
+  # The `SHELL:` prefix is used to turn off de-duplication of compiler flags,
+  # it is necessary if other headers are need to be force included.
+  # https://cmake.org/cmake/help/latest/command/target_compile_options.html#option-de-duplication
+  list(APPEND KLEIDICV_OPENCV_CXX_FLAGS "SHELL:-include kleidicv/unsafe.h")
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+  list(APPEND KLEIDICV_OPENCV_CXX_FLAGS "-O0" "-g")
+else()
+  list(APPEND KLEIDICV_OPENCV_CXX_FLAGS "-O2" "-g0")
+endif()
+
+add_library(kleidicv_opencv_neon OBJECT ${KLEIDICV_OPENCV_NEON_SOURCES})
+target_include_directories(kleidicv_opencv_neon PRIVATE ${KLEIDICV_OPENCV_INCLUDE_DIRS})
+set_target_properties(kleidicv_opencv_neon PROPERTIES CXX_STANDARD 17)
+target_compile_options(kleidicv_opencv_neon PRIVATE
+  ${KLEIDICV_OPENCV_CXX_FLAGS}
+  "-march=armv8-a"
+  "-DKLEIDICV_TARGET_NEON=1"
+)
+
+if(KLEIDICV_ENABLE_SVE2)
+  add_library(kleidicv_opencv_sve2 OBJECT ${KLEIDICV_OPENCV_SVE2_SOURCES})
+  target_include_directories(kleidicv_opencv_sve2 PRIVATE ${KLEIDICV_OPENCV_INCLUDE_DIRS})
+  set_target_properties(kleidicv_opencv_sve2 PROPERTIES CXX_STANDARD 17)
+  target_compile_options(kleidicv_opencv_sve2 PRIVATE
+    ${KLEIDICV_OPENCV_CXX_FLAGS}
+    "-march=armv8-a+sve2"
+    "-DKLEIDICV_TARGET_SVE2=1"
+  )
+endif()
+
+if(KLEIDICV_ENABLE_SME2)
+  add_library(kleidicv_opencv_sme2 OBJECT ${KLEIDICV_OPENCV_SME2_SOURCES})
+  target_include_directories(kleidicv_opencv_sme2 PRIVATE ${KLEIDICV_OPENCV_INCLUDE_DIRS})
+  set_target_properties(kleidicv_opencv_sme2 PROPERTIES CXX_STANDARD 17)
+  target_compile_options(kleidicv_opencv_sme2 PRIVATE
+    ${KLEIDICV_OPENCV_CXX_FLAGS}
+    "-march=armv9-a+sve2+sme2+nosimd"
+    "-DKLEIDICV_TARGET_SME2=1"
+  )
+endif()
+
+add_library(kleidicv_opencv STATIC ${KLEIDICV_OPENCV_API_SOURCES})
+target_include_directories(kleidicv_opencv PRIVATE ${KLEIDICV_OPENCV_INCLUDE_DIRS})
+set_target_properties(kleidicv_opencv PROPERTIES CXX_STANDARD 17)
+target_compile_options(kleidicv_opencv PRIVATE ${KLEIDICV_OPENCV_CXX_FLAGS})
+target_link_libraries(kleidicv_opencv PRIVATE kleidicv_opencv_neon)
+
+if(KLEIDICV_ENABLE_SVE2)
+  target_compile_definitions(kleidicv_opencv PRIVATE KLEIDICV_HAVE_SVE2)
+  target_link_libraries(kleidicv_opencv PRIVATE kleidicv_opencv_sve2)
+endif()
+
+if(KLEIDICV_ENABLE_SME2)
+  target_compile_definitions(kleidicv_opencv PRIVATE KLEIDICV_HAVE_SME2)
+  target_link_libraries(kleidicv_opencv PRIVATE kleidicv_opencv_sme2)
+endif()
diff --git a/kleidicv_opencv/README.md b/kleidicv_opencv/README.md
new file mode 100644
index 000000000..92112d5e3
--- /dev/null
+++ b/kleidicv_opencv/README.md
@@ -0,0 +1,12 @@
+<!--
+SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# kleidicv_opencv
+
+This library provides functionality that is specific to OpenCV's Hardware
+Abstraction Layer (HAL), and is not intended to be useful outside that context.
+It differs from the code in the `adapters/opencv` directory in that it can be
+built and tested independently of OpenCV.
diff --git a/kleidicv_opencv/include/kleidicv_opencv/config.h b/kleidicv_opencv/include/kleidicv_opencv/config.h
new file mode 100644
index 000000000..a6353a844
--- /dev/null
+++ b/kleidicv_opencv/include/kleidicv_opencv/config.h
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_CONFIG_H
+#define KLEIDICV_OPENCV_CONFIG_H
+
+#if KLEIDICV_TARGET_NEON
+#define KLEIDICV_OPENCV_TARGET_NAMESPACE kleidicv_opencv::neon
+#elif KLEIDICV_TARGET_SVE2
+#define KLEIDICV_OPENCV_TARGET_NAMESPACE kleidicv_opencv::sve2
+#elif KLEIDICV_TARGET_SME2
+#define KLEIDICV_OPENCV_TARGET_NAMESPACE kleidicv_opencv::sme2
+#endif
+
+#endif  // KLEIDICV_OPENCV_CONFIG_H
diff --git a/kleidicv_opencv/include/kleidicv_opencv/kleidicv_opencv.h b/kleidicv_opencv/include/kleidicv_opencv/kleidicv_opencv.h
new file mode 100644
index 000000000..275040814
--- /dev/null
+++ b/kleidicv_opencv/include/kleidicv_opencv/kleidicv_opencv.h
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_H
+#define KLEIDICV_OPENCV_H
+
+#include "kleidicv/kleidicv.h"
+#include "kleidicv_opencv/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/// Internal - not part of the public API and its direct use is not supported.
+/// For details see
+/// https://github.com/opencv/opencv/blob/4.11.0/modules/video/src/hal_replacement.hpp#L29
+KLEIDICV_API_DECLARATION(kleidicv_opencv_optical_flow_u8,
+                         const uint8_t *prev_data, size_t prev_data_step,
+                         const int16_t *prev_deriv_data, size_t prev_deriv_step,
+                         const uint8_t *next_data, size_t next_step, int width,
+                         int height, int channels, const float *prev_points,
+                         float *next_points, size_t point_count,
+                         uint8_t *status, float *err, int window_width,
+                         int window_height, int termination_count,
+                         double termination_epsilon, bool get_min_eigen_vals,
+                         float min_eigen_vals_threshold);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // KLEIDICV_OPENCV_H
diff --git a/kleidicv_opencv/include/kleidicv_opencv/optical_flow.h b/kleidicv_opencv/include/kleidicv_opencv/optical_flow.h
new file mode 100644
index 000000000..5577dad88
--- /dev/null
+++ b/kleidicv_opencv/include/kleidicv_opencv/optical_flow.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_OPTICAL_FLOW_H
+#define KLEIDICV_OPENCV_OPTICAL_FLOW_H
+
+#include "kleidicv/kleidicv.h"
+
+namespace kleidicv_opencv {
+
+namespace neon {
+
+kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold);
+
+}  // namespace neon
+
+namespace sve2 {
+
+kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold);
+
+}  // namespace sve2
+
+namespace sme2 {
+
+kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold);
+
+}  // namespace sme2
+
+}  // namespace kleidicv_opencv
+
+#endif  // KLEIDICV_OPENCV_OPTICAL_FLOW_H
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_api.cpp b/kleidicv_opencv/src/optical_flow/optical_flow_api.cpp
new file mode 100644
index 000000000..86a6c693b
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_api.cpp
@@ -0,0 +1,12 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/dispatch.h"
+#include "kleidicv_opencv/kleidicv_opencv.h"
+#include "kleidicv_opencv/optical_flow.h"
+
+KLEIDICV_MULTIVERSION_C_API(
+    kleidicv_opencv_optical_flow_u8, &kleidicv_opencv::neon::optical_flow_u8,
+    KLEIDICV_SVE2_IMPL_IF(&kleidicv_opencv::sve2::optical_flow_u8),
+    KLEIDICV_SME2_IMPL_IF(&kleidicv_opencv::sme2::optical_flow_u8));
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_common.h b/kleidicv_opencv/src/optical_flow/optical_flow_common.h
new file mode 100644
index 000000000..b1edfb196
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_common.h
@@ -0,0 +1,238 @@
+// SPDX-FileCopyrightText: 2000, Intel Corporation, all rights reserved.
+// SPDX-FileCopyrightText: 2013, OpenCV Foundation, all rights reserved.
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: BSD-3-Clause AND Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_OPTICAL_FLOW_COMMON_H
+#define KLEIDICV_OPENCV_OPTICAL_FLOW_COMMON_H
+
+#include <array>
+#include <cfloat>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "kleidicv/containers/small_buffer.h"
+#include "kleidicv_opencv/kleidicv_opencv.h"
+#include "kleidicv_opencv/optical_flow.h"
+
+#define KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS 14
+#define KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE (1.0F / (1 << 20))
+
+namespace KLEIDICV_OPENCV_TARGET_NAMESPACE {
+
+template <int shift>
+inline std::array<int16_t, 4> get_lerp_params(float x, float y)
+    KLEIDICV_STREAMING_COMPATIBLE {
+  int16_t a =
+      static_cast<int16_t>(rintf((1.0F - x) * (1.0F - y) * (1 << shift)));
+  int16_t b = static_cast<int16_t>(rintf(x * (1.0F - y) * (1 << shift)));
+  int16_t c = static_cast<int16_t>(rintf((1.0F - x) * y * (1 << shift)));
+  int16_t d = (1 << shift) - a - b - c;
+  return {a, b, c, d};
+}
+
+template <int fraction_bits>
+inline int round_fixed_point(int x) KLEIDICV_STREAMING_COMPATIBLE {
+  int half = 1 << (fraction_bits - 1);
+  return (x + half) >> fraction_bits;
+}
+
+// This function is too complex, but disable the warning for now.
+// NOLINTBEGIN(readability-function-cognitive-complexity)
+template <typename Impl>
+KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t optical_flow_common(
+    int16_t *window, int16_t *scharr_window, const uint8_t *prev_data,
+    size_t prev_data_stride, const int16_t *scharr_data,
+    size_t scharr_stride_bytes, const uint8_t *next_data, size_t next_stride,
+    int width, int height, int channels, const float *prev_points,
+    float *next_points, size_t point_count, uint8_t *status, float *err,
+    int window_width, int window_height, int termination_count,
+    double termination_epsilon, bool get_min_eigen_vals,
+    float min_eigen_vals_threshold) KLEIDICV_STREAMING_COMPATIBLE {
+  if (channels != 1) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  auto point_out_of_bounds = [&](int x, int y) {
+    return x < -window_width || x >= width || y < -window_height || y >= height;
+  };
+
+  const ptrdiff_t scharr_stride_elements =
+      static_cast<ptrdiff_t>(scharr_stride_bytes / sizeof(int16_t));
+
+  const float half_window_width = static_cast<float>(window_width - 1) * 0.5F;
+  const float half_window_height = static_cast<float>(window_height - 1) * 0.5F;
+
+  for (size_t point_index = 0; point_index < point_count; point_index++) {
+    const float prev_x = prev_points[point_index * 2] - half_window_width;
+    const float prev_y = prev_points[point_index * 2 + 1] - half_window_height;
+    const int prev_xi = static_cast<int>(floorf(prev_x));
+    const int prev_yi = static_cast<int>(floorf(prev_y));
+
+    if (point_out_of_bounds(prev_xi, prev_yi)) {
+      if (status) {
+        status[point_index] = false;
+      }
+      continue;
+    }
+
+    const auto [coeff_tl, coeff_tr, coeff_bl, coeff_br] =
+        get_lerp_params<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+            prev_x - static_cast<float>(prev_xi),
+            prev_y - static_cast<float>(prev_yi));
+
+    Impl::get_window(window, prev_data, prev_data_stride, channels, prev_xi,
+                     prev_yi, window_width, window_height, coeff_tl, coeff_tr,
+                     coeff_bl, coeff_br);
+
+    float sum_scharr_xx = 0, sum_scharr_xy = 0, sum_scharr_yy = 0;
+    Impl::get_scharr(scharr_window, scharr_data, scharr_stride_elements,
+                     channels, prev_xi, prev_yi, window_width, window_height,
+                     coeff_tl, coeff_tr, coeff_bl, coeff_br, sum_scharr_xx,
+                     sum_scharr_xy, sum_scharr_yy);
+
+    const float determinant =
+        sum_scharr_xx * sum_scharr_yy - powf(sum_scharr_xy, 2);
+    float min_eigen_val = (sum_scharr_yy + sum_scharr_xx -
+                           sqrtf(powf(sum_scharr_xx - sum_scharr_yy, 2) +
+                                 4 * powf(sum_scharr_xy, 2))) /
+                          static_cast<float>(2L * window_width * window_height);
+
+    if (err && get_min_eigen_vals) {
+      err[point_index] = min_eigen_val;
+    }
+
+    if (min_eigen_val < min_eigen_vals_threshold || determinant < FLT_EPSILON) {
+      if (status) {
+        status[point_index] = false;
+      }
+      continue;
+    }
+
+    const float inverse_determinant = 1.0F / determinant;
+
+    float next_x = next_points[point_index * 2] - half_window_width;
+    float next_y = next_points[point_index * 2 + 1] - half_window_height;
+    float prev_velocity_x = 0, prev_velocity_y = 0;
+
+    for (int j = 0; j < termination_count; j++) {
+      const int next_xi = static_cast<int>(floorf(next_x));
+      const int next_yi = static_cast<int>(floorf(next_y));
+
+      if (point_out_of_bounds(next_xi, next_yi)) {
+        if (status) {
+          status[point_index] = false;
+        }
+        break;
+      }
+
+      const auto [coeff_tl, coeff_tr, coeff_bl, coeff_br] =
+          get_lerp_params<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+              next_x - static_cast<float>(next_xi),
+              next_y - static_cast<float>(next_yi));
+
+      float sum_diff_scharr_x = 0, sum_diff_scharr_y = 0;
+      Impl::get_sum_diff_scharr(next_data, next_stride, window, scharr_window,
+                                channels, next_xi, next_yi, window_width,
+                                window_height, coeff_tl, coeff_tr, coeff_bl,
+                                coeff_br, sum_diff_scharr_x, sum_diff_scharr_y);
+
+      const float velocity_x = (sum_scharr_xy * sum_diff_scharr_y -
+                                sum_scharr_yy * sum_diff_scharr_x) *
+                               inverse_determinant;
+      const float velocity_y = (sum_scharr_xy * sum_diff_scharr_x -
+                                sum_scharr_xx * sum_diff_scharr_y) *
+                               inverse_determinant;
+
+      next_x += velocity_x;
+      next_y += velocity_y;
+      next_points[point_index * 2] = next_x + half_window_width;
+      next_points[point_index * 2 + 1] = next_y + half_window_height;
+
+      if (velocity_x * velocity_x + velocity_y * velocity_y <=
+          termination_epsilon) {
+        break;
+      }
+
+      if (j != 0 && fabsf(velocity_x + prev_velocity_x) < 0.01F &&
+          fabsf(velocity_y + prev_velocity_y) < 0.01F) {
+        next_points[point_index * 2] -= velocity_x * 0.5F;
+        next_points[point_index * 2 + 1] -= velocity_y * 0.5F;
+        break;
+      }
+      prev_velocity_x = velocity_x;
+      prev_velocity_y = velocity_y;
+    }
+
+    if (status && status[point_index]) {
+      next_x = next_points[point_index * 2] - half_window_width;
+      next_y = next_points[point_index * 2 + 1] - half_window_height;
+      const int next_xi = static_cast<int>(floorf(next_x));
+      const int next_yi = static_cast<int>(floorf(next_y));
+
+      if (point_out_of_bounds(next_xi, next_yi)) {
+        status[point_index] = false;
+      } else if (err && !get_min_eigen_vals) {
+        const auto [coeff_tl, coeff_tr, coeff_bl, coeff_br] =
+            get_lerp_params<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+                next_x - static_cast<float>(next_xi),
+                next_y - static_cast<float>(next_yi));
+        float errval = 0;
+
+        for (int y = 0; y < window_height; y++) {
+          int16_t *window_row =
+              window + static_cast<ptrdiff_t>(y) * window_width;
+          const uint8_t *next_row0 =
+              next_data + (y + next_yi) * static_cast<ptrdiff_t>(next_stride) +
+              static_cast<ptrdiff_t>(next_xi) * channels;
+          const uint8_t *next_row1 = next_row0 + next_stride;
+
+          for (int x = 0; x < window_width * channels; x++) {
+            int diff =
+                round_fixed_point<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS -
+                                  5>(next_row0[x] * coeff_tl +
+                                     next_row0[x + channels] * coeff_tr +
+                                     next_row1[x] * coeff_bl +
+                                     next_row1[x + channels] * coeff_br) -
+                window_row[x];
+            errval += fabsf(static_cast<float>(diff));
+          }
+        }
+        err[point_index] =
+            errval * 1.0F /
+            static_cast<float>(32L * window_width * channels * window_height);
+      }
+    }
+  }
+  return KLEIDICV_OK;
+}
+// NOLINTEND(readability-function-cognitive-complexity)
+
+// Allocates optical flow's window buffers.
+// If the required buffer size is no greater than StackBufferSize then the
+// buffers will be allocated on the stack. Typical window width is 21.
+template <int StackBufferSize = 21UL * 21UL * 3UL>
+class OpticalFlowWindowBuffer {
+ public:
+  OpticalFlowWindowBuffer(int window_width, int window_height, int channels)
+      : window_buffer_(static_cast<size_t>(window_width) * window_height *
+                       channels * 3UL),
+        deriv_window_(window_buffer_.get()
+                          ? window_buffer_.get() +
+                                static_cast<size_t>(window_width) *
+                                    window_height * channels
+                          : nullptr) {}
+  int16_t *window() { return window_buffer_.get(); }
+  int16_t *deriv_window() { return deriv_window_; }
+
+ private:
+  kleidicv::SmallBuffer<int16_t, StackBufferSize> window_buffer_;
+  int16_t *deriv_window_;
+};
+
+}  // namespace KLEIDICV_OPENCV_TARGET_NAMESPACE
+
+#endif  // KLEIDICV_OPENCV_OPTICAL_FLOW_COMMON_H
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_neon.cpp b/kleidicv_opencv/src/optical_flow/optical_flow_neon.cpp
new file mode 100644
index 000000000..09d1dd07d
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_neon.cpp
@@ -0,0 +1,608 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv/neon.h"
+#include "kleidicv_opencv/optical_flow.h"
+#include "optical_flow_common.h"
+
+namespace kleidicv_opencv::neon {
+
+struct OpticalFlow {
+  // Like SVE's svld1ub_s16 but for Neon.
+  // Loads 4 bytes and widens them to int16.
+  static inline int16x4_t vld1ub_s16(const uint8_t *ptr) {
+    uint32_t a = 0;
+    memcpy(&a, ptr, sizeof(a));
+    uint32x2_t b = vset_lane_u32(a, vdup_n_s32(0), 0);
+    uint8x8_t c = vreinterpret_u8_u32(b);
+    return vget_low_s16(vmovl_u8(c));
+  }
+
+  template <int shift>
+  static inline int16x4_t lerp(int16x4_t tl, int16x4_t tr, int16x4_t bl,
+                               int16x4_t br, int16x4_t coeff_tl,
+                               int16x4_t coeff_tr, int16x4_t coeff_bl,
+                               int16x4_t coeff_br) {
+    int32x4_t accumulator = vmull_s16(tl, coeff_tl);
+    accumulator = vmlal_s16(accumulator, tr, coeff_tr);
+    accumulator = vmlal_s16(accumulator, bl, coeff_bl);
+    accumulator = vmlal_s16(accumulator, br, coeff_br);
+    return vqrshrn_n_s32(accumulator, shift);
+  }
+
+  struct OpticalFlowGetWindowProcessRows {
+    int16_t *window;
+    const uint8_t *prev_data;
+    ptrdiff_t prev_data_stride;
+    int channels, window_corner_x, window_corner_y, window_width, window_height;
+
+    template <typename F>
+    void operator()(F f) const {
+      for (int y = 0; y < window_height; y++) {
+        const uint8_t *const prev_row0 =
+            prev_data + (y + window_corner_y) * prev_data_stride +
+            static_cast<ptrdiff_t>(window_corner_x) * channels;
+        const uint8_t *const prev_row1 = prev_row0 + prev_data_stride;
+
+        int16_t *const window_row =
+            window + static_cast<ptrdiff_t>(y) * window_width;
+
+        f(prev_row0, prev_row1, window_row);
+      }
+    }
+  };
+
+  // NOLINTBEGIN(readability-function-cognitive-complexity)
+  static inline void get_window(int16_t *window, const uint8_t *prev_data,
+                                ptrdiff_t prev_data_stride, int channels,
+                                int window_corner_x, int window_corner_y,
+                                int window_width, int window_height,
+                                int16_t coeff_tl, int16_t coeff_tr,
+                                int16_t coeff_bl, int16_t coeff_br) {
+    const int16x4_t coeff_tl_v = vdup_n_s16(coeff_tl);
+    const int16x4_t coeff_tr_v = vdup_n_s16(coeff_tr);
+    const int16x4_t coeff_bl_v = vdup_n_s16(coeff_bl);
+    const int16x4_t coeff_br_v = vdup_n_s16(coeff_br);
+
+    auto process1 = [&](const uint8_t *prev_row0, const uint8_t *prev_row1,
+                        int16_t *window_row, int x) {
+      window_row[x] = static_cast<int16_t>(
+          round_fixed_point<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+              prev_row0[x] * coeff_tl + prev_row0[x + channels] * coeff_tr +
+              prev_row1[x] * coeff_bl + prev_row1[x + channels] * coeff_br));
+    };
+    auto process4 = [&](const uint8_t *prev_row0, const uint8_t *prev_row1,
+                        int16_t *window_row, int x) {
+      int16x4_t prev_tl = vld1ub_s16(prev_row0 + x);
+      int16x4_t prev_tr = vld1ub_s16(prev_row0 + x + channels);
+      int16x4_t prev_bl = vld1ub_s16(prev_row1 + x);
+      int16x4_t prev_br = vld1ub_s16(prev_row1 + x + channels);
+
+      int16x4_t prev = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+          prev_tl, prev_tr, prev_bl, prev_br, coeff_tl_v, coeff_tr_v,
+          coeff_bl_v, coeff_br_v);
+      vst1_s16(window_row + x, prev);
+    };
+    auto process8 = [&](const uint8_t *prev_row0, const uint8_t *prev_row1,
+                        int16_t *window_row, int x) {
+      int16x8_t prev_tl =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(prev_row0 + x)));
+      int16x8_t prev_tr =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(prev_row0 + x + channels)));
+      int16x8_t prev_bl =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(prev_row1 + x)));
+      int16x8_t prev_br =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(prev_row1 + x + channels)));
+
+      int16x4_t prev_low = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+          vget_low_u16(prev_tl), vget_low_u16(prev_tr), vget_low_u16(prev_bl),
+          vget_low_u16(prev_br), coeff_tl_v, coeff_tr_v, coeff_bl_v,
+          coeff_br_v);
+      int16x4_t prev_high =
+          lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+              vget_high_u16(prev_tl), vget_high_u16(prev_tr),
+              vget_high_u16(prev_bl), vget_high_u16(prev_br), coeff_tl_v,
+              coeff_tr_v, coeff_bl_v, coeff_br_v);
+      vst1q_s16(window_row + x, vcombine_s16(prev_low, prev_high));
+    };
+
+    OpticalFlowGetWindowProcessRows process_rows = {
+        window,          prev_data,       prev_data_stride, channels,
+        window_corner_x, window_corner_y, window_width,     window_height};
+
+    if (window_width * channels <= 8) {
+      // Handle small window width
+      if (window_width * channels > 4) {
+        process_rows(
+            [&](const uint8_t *row0, const uint8_t *row1, int16_t *window_row) {
+              process4(row0, row1, window_row, 0);
+              process4(row0, row1, window_row, window_width * channels - 4);
+            });
+      } else {
+        // Handle tiny window width
+        process_rows(
+            [&](const uint8_t *row0, const uint8_t *row1, int16_t *window_row) {
+              for (int x = 0; x < window_width * channels; ++x) {
+                process1(row0, row1, window_row, x);
+              }
+            });
+      }
+    } else {
+      switch (window_width * channels & 7) {
+        case 1:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           int16_t *window_row) {
+            int x = 0;
+            for (; x + 8 <= window_width * channels; x += 8) {
+              process8(row0, row1, window_row, x);
+            }
+            process1(row0, row1, window_row, x);
+          });
+          break;
+        case 5:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           int16_t *window_row) {
+            int x = 0;
+            for (; x + 8 <= window_width * channels; x += 8) {
+              process8(row0, row1, window_row, x);
+            }
+            process4(row0, row1, window_row, x);
+            process1(row0, row1, window_row, x + 4);
+          });
+          break;
+        case 6:
+        case 7:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           int16_t *window_row) {
+            for (int x = 0; x + 8 <= window_width * channels; x += 8) {
+              process8(row0, row1, window_row, x);
+            }
+            process8(row0, row1, window_row, window_width * channels - 8);
+          });
+          break;
+        default:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           int16_t *window_row) {
+            for (int x = 0; x + 8 <= window_width * channels; x += 8) {
+              process8(row0, row1, window_row, x);
+            }
+            process4(row0, row1, window_row, window_width * channels - 4);
+          });
+          break;
+      }
+    }
+  }
+  // NOLINTEND(readability-function-cognitive-complexity)
+
+  struct OpticalFlowGetScharrProcessRows {
+    int16_t *scharr_window;
+    const int16_t *scharr_data;
+    ptrdiff_t scharr_stride_elements;
+    int channels, window_corner_x, window_corner_y, window_width, window_height;
+
+    template <typename F>
+    void operator()(F f) const {
+      for (int y = 0; y < window_height; y++) {
+        const int16_t *const scharr_row0 =
+            scharr_data + (y + window_corner_y) * scharr_stride_elements +
+            static_cast<ptrdiff_t>(window_corner_x) * channels * 2L;
+        const int16_t *const scharr_row1 = scharr_row0 + scharr_stride_elements;
+
+        int16_t *const scharr_window_row =
+            scharr_window + static_cast<ptrdiff_t>(y) * window_width * 2L;
+
+        f(scharr_row0, scharr_row1, scharr_window_row);
+      }
+    }
+  };
+
+  static inline void get_scharr(
+      int16_t *scharr_window, const int16_t *scharr_data,
+      ptrdiff_t scharr_stride_elements, int channels, int window_corner_x,
+      int window_corner_y, int window_width, int window_height,
+      int16_t coeff_tl, int16_t coeff_tr, int16_t coeff_bl, int16_t coeff_br,
+      float &sum_scharr_xx, float &sum_scharr_xy, float &sum_scharr_yy) {
+    sum_scharr_xx = 0;
+    sum_scharr_xy = 0;
+    sum_scharr_yy = 0;
+
+    float32x4_t sum_scharr_xx_v = vdupq_n_f32(0),
+                sum_scharr_xy_v = vdupq_n_f32(0),
+                sum_scharr_yy_v = vdupq_n_f32(0);
+
+    const int16x4_t coeff_tl_v = vdup_n_s16(coeff_tl);
+    const int16x4_t coeff_tr_v = vdup_n_s16(coeff_tr);
+    const int16x4_t coeff_bl_v = vdup_n_s16(coeff_bl);
+    const int16x4_t coeff_br_v = vdup_n_s16(coeff_br);
+
+    auto make_leftover_accumulate_mask_s16 = [](int width) {
+      int count3 = width & 3;
+      int count4 = count3 == 0 ? 4 : count3;
+      return vcreate_s16(0x000000000000ffffULL * (count4 >= 4) +
+                         0x00000000ffff0000ULL * (count4 >= 3) +
+                         0x0000ffff00000000ULL * (count4 >= 2) +
+                         0xffff000000000000ULL * (count4 >= 1));
+    };
+    const int16x4_t leftover_mask =
+        make_leftover_accumulate_mask_s16(window_width * channels);
+
+    auto process1_and_accumulate = [&](const int16_t *row0, const int16_t *row1,
+                                       int16_t *scharr_window_row, int x) {
+      int scharr_x =
+          round_fixed_point<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+              row0[x * 2L] * coeff_tl + row0[(x + channels) * 2L] * coeff_tr +
+              row1[x * 2L] * coeff_bl + row1[(x + channels) * 2L] * coeff_br);
+      int scharr_y =
+          round_fixed_point<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+              row0[x * 2L + 1] * coeff_tl +
+              row0[(x + channels) * 2L + 1] * coeff_tr +
+              row1[x * 2L + 1] * coeff_bl +
+              row1[(x + channels) * 2L + 1] * coeff_br);
+
+      scharr_window_row[x * 2L] = static_cast<int16_t>(scharr_x);
+      scharr_window_row[x * 2L + 1] = static_cast<int16_t>(scharr_y);
+
+      sum_scharr_xx += static_cast<float>(scharr_x * scharr_x);
+      sum_scharr_xy += static_cast<float>(scharr_x * scharr_y);
+      sum_scharr_yy += static_cast<float>(scharr_y * scharr_y);
+    };
+    auto process4 = [&](const int16_t *row0, const int16_t *row1,
+                        int16_t *scharr_window_row, int x) {
+      int16x4x2_t scharr_tl = vld2_s16(row0 + x * 2L);
+      int16x4x2_t scharr_tr = vld2_s16(row0 + (x + channels) * 2L);
+      int16x4x2_t scharr_bl = vld2_s16(row1 + x * 2L);
+      int16x4x2_t scharr_br = vld2_s16(row1 + (x + channels) * 2L);
+
+      int16x4_t scharr_x = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+          scharr_tl.val[0], scharr_tr.val[0], scharr_bl.val[0],
+          scharr_br.val[0], coeff_tl_v, coeff_tr_v, coeff_bl_v, coeff_br_v);
+
+      int16x4_t scharr_y = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+          scharr_tl.val[1], scharr_tr.val[1], scharr_bl.val[1],
+          scharr_br.val[1], coeff_tl_v, coeff_tr_v, coeff_bl_v, coeff_br_v);
+
+      int16x4x2_t scharr_xy{scharr_x, scharr_y};
+
+      vst2_s16(scharr_window_row + x * 2L, scharr_xy);
+
+      return scharr_xy;
+    };
+    auto accumulate = [&](int16x4_t scharr_x, int16x4_t scharr_y) {
+      // sum_scharr_xx += scharr_x * scharr_x;
+      sum_scharr_xx_v = vaddq_f32(sum_scharr_xx_v,
+                                  vcvtq_f32_s32(vmull_s16(scharr_x, scharr_x)));
+
+      // sum_scharr_xy += scharr_x * scharr_y;
+      sum_scharr_xy_v = vaddq_f32(sum_scharr_xy_v,
+                                  vcvtq_f32_s32(vmull_s16(scharr_x, scharr_y)));
+
+      // sum_scharr_yy += scharr_y * scharr_y;
+      sum_scharr_yy_v = vaddq_f32(sum_scharr_yy_v,
+                                  vcvtq_f32_s32(vmull_s16(scharr_y, scharr_y)));
+    };
+    auto process4_and_accumulate = [&](const int16_t *row0, const int16_t *row1,
+                                       int16_t *scharr_window_row, int x) {
+      int16x4x2_t scharr_xy = process4(row0, row1, scharr_window_row, x);
+      accumulate(scharr_xy.val[0], scharr_xy.val[1]);
+    };
+    auto process4_and_accumulate_leftovers = [&](const int16_t *row0,
+                                                 const int16_t *row1,
+                                                 int16_t *scharr_window_row) {
+      int16x4x2_t scharr_xy =
+          process4(row0, row1, scharr_window_row, window_width * channels - 4);
+      int16x4_t masked_x = vand_s16(scharr_xy.val[0], leftover_mask);
+      int16x4_t masked_y = vand_s16(scharr_xy.val[1], leftover_mask);
+      accumulate(masked_x, masked_y);
+    };
+
+    OpticalFlowGetScharrProcessRows process_rows = {
+        scharr_window, scharr_data,     scharr_stride_elements,
+        channels,      window_corner_x, window_corner_y,
+        window_width,  window_height};
+
+    if ((window_width * channels & 3) == 1) {
+      // Optimization for the most common case that the window width is a
+      // multiple of four plus one.
+      process_rows(
+          [&](const int16_t *row0, const int16_t *row1, int16_t *window_row) {
+            int x = 0;
+            for (; x + 4 <= window_width * channels; x += 4) {
+              process4_and_accumulate(row0, row1, window_row, x);
+            }
+            process1_and_accumulate(row0, row1, window_row, x);
+          });
+    } else if (window_width * channels <= 4) {
+      // Handle small window width
+      process_rows(
+          [&](const int16_t *row0, const int16_t *row1, int16_t *window_row) {
+            for (int x = 0; x < window_width * channels; ++x) {
+              process1_and_accumulate(row0, row1, window_row, x);
+            }
+          });
+    } else {
+      process_rows(
+          [&](const int16_t *row0, const int16_t *row1, int16_t *window_row) {
+            for (int x = 0; x + 4 < window_width * channels; x += 4) {
+              process4_and_accumulate(row0, row1, window_row, x);
+            }
+            process4_and_accumulate_leftovers(row0, row1, window_row);
+          });
+    }
+
+    sum_scharr_xx += vaddvq_f32(sum_scharr_xx_v);
+    sum_scharr_xy += vaddvq_f32(sum_scharr_xy_v);
+    sum_scharr_yy += vaddvq_f32(sum_scharr_yy_v);
+    sum_scharr_xx *= KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_scharr_xy *= KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_scharr_yy *= KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+  }
+
+  struct OpticalFlowGetSumDiffScharrProcessRows {
+    const uint8_t *next_data;
+    ptrdiff_t next_stride;
+    const int16_t *window;
+    const int16_t *scharr_window;
+    int channels, window_corner_x, window_corner_y, window_width, window_height;
+
+    template <typename F>
+    void operator()(F f) const {
+      for (int y = 0; y < window_height; y++) {
+        const uint8_t *row0 =
+            next_data + (y + window_corner_y) * next_stride +
+            static_cast<ptrdiff_t>(window_corner_x) * channels;
+        const uint8_t *row1 = row0 + next_stride;
+        const int16_t *window_row =
+            window + static_cast<ptrdiff_t>(y) * window_width;
+        const int16_t *scharr_window_row =
+            scharr_window + static_cast<ptrdiff_t>(y) * window_width * 2L;
+
+        f(row0, row1, window_row, scharr_window_row);
+      }
+    }
+  };
+
+  // This function is too complex, but disable the warning for now.
+  // NOLINTBEGIN(readability-function-cognitive-complexity)
+  static inline void get_sum_diff_scharr(
+      const uint8_t *next_data, ptrdiff_t next_stride, const int16_t *window,
+      const int16_t *scharr_window, int channels, int window_corner_x,
+      int window_corner_y, int window_width, int window_height,
+      int16_t coeff_tl, int16_t coeff_tr, int16_t coeff_bl, int16_t coeff_br,
+      float &sum_diff_scharr_x, float &sum_diff_scharr_y) {
+    OpticalFlowGetSumDiffScharrProcessRows process_rows = {
+        next_data,       next_stride,     window,       scharr_window, channels,
+        window_corner_x, window_corner_y, window_width, window_height};
+
+    sum_diff_scharr_x = 0;
+    sum_diff_scharr_y = 0;
+
+    float32x4_t sum_diff_scharr_x_v = vdupq_n_f32(0),
+                sum_diff_scharr_y_v = vdupq_n_f32(0);
+
+    const int16x4_t coeff_tl_v = vdup_n_s16(coeff_tl);
+    const int16x4_t coeff_tr_v = vdup_n_s16(coeff_tr);
+    const int16x4_t coeff_bl_v = vdup_n_s16(coeff_bl);
+    const int16x4_t coeff_br_v = vdup_n_s16(coeff_br);
+
+    auto make_leftover_accumulate_mask_s32 = [](int width) {
+      int count7 = width & 7;
+      int count3 = width & 3;
+      int count4 = count7 == 4 ? 4 : count3;
+      return vcombine_s32(vcreate_s32(0x00000000ffffffffULL * (count4 >= 4) +
+                                      0xffffffff00000000ULL * (count4 >= 3)),
+                          vcreate_s32(0x00000000ffffffffULL * (count4 >= 2) +
+                                      0xffffffff00000000ULL * (count4 >= 1)));
+    };
+
+    const int32x4_t leftover_mask =
+        make_leftover_accumulate_mask_s32(window_width * channels);
+
+    auto process8_and_accumulate = [&](const uint8_t *row0, const uint8_t *row1,
+                                       const int16_t *window_row,
+                                       const int16_t *scharr_window_row,
+                                       int x) {
+      int16x8_t tl = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(row0 + x)));
+      int16x8_t tr =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(row0 + x + channels)));
+      int16x8_t bl = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(row1 + x)));
+      int16x8_t br =
+          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(row1 + x + channels)));
+
+      int16x4_t low = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+          vget_low_s16(tl), vget_low_s16(tr), vget_low_s16(bl),
+          vget_low_s16(br), coeff_tl_v, coeff_tr_v, coeff_bl_v, coeff_br_v);
+
+      int16x4_t high = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+          vget_high_s16(tl), vget_high_s16(tr), vget_high_s16(bl),
+          vget_high_s16(br), coeff_tl_v, coeff_tr_v, coeff_bl_v, coeff_br_v);
+
+      int16x8_t next = vcombine_s16(low, high);
+
+      int16x8_t diff = vsubq_s16(next, vld1q_s16(window_row + x));
+
+      int16x8x2_t scharr_xy = vld2q_s16(scharr_window_row + x * 2L);
+      int16x8_t scharr_x = scharr_xy.val[0];
+      int16x8_t scharr_y = scharr_xy.val[1];
+
+      int32x4_t diff_scharr_x =
+          vmlal_high_s16(vmull_s16(vget_low_s16(scharr_x), vget_low_s16(diff)),
+                         scharr_x, diff);
+      int32x4_t diff_scharr_y =
+          vmlal_high_s16(vmull_s16(vget_low_s16(scharr_y), vget_low_s16(diff)),
+                         scharr_y, diff);
+
+      sum_diff_scharr_x_v =
+          vaddq_f32(sum_diff_scharr_x_v, vcvtq_f32_s32(diff_scharr_x));
+      sum_diff_scharr_y_v =
+          vaddq_f32(sum_diff_scharr_y_v, vcvtq_f32_s32(diff_scharr_y));
+    };
+    auto process4 = [&](const uint8_t *row0, const uint8_t *row1,
+                        const int16_t *window_row,
+                        const int16_t *scharr_window_row, int x) {
+      uint16x4_t tl = vld1ub_s16(row0 + x);
+      uint16x4_t tr = vld1ub_s16(row0 + x + channels);
+      uint16x4_t bl = vld1ub_s16(row1 + x);
+      uint16x4_t br = vld1ub_s16(row1 + x + channels);
+
+      int16x4_t next = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+          tl, tr, bl, br, coeff_tl_v, coeff_tr_v, coeff_bl_v, coeff_br_v);
+
+      int16x4_t diff = vsub_s16(next, vld1_s16(window_row + x));
+
+      int16x4x2_t scharr_xy = vld2_s16(scharr_window_row + x * 2L);
+      int16x4_t scharr_x = scharr_xy.val[0];
+      int16x4_t scharr_y = scharr_xy.val[1];
+
+      int32x4_t diff_scharr_x = vmull_s16(scharr_x, diff);
+      int32x4_t diff_scharr_y = vmull_s16(scharr_y, diff);
+      return int32x4x2_t{diff_scharr_x, diff_scharr_y};
+    };
+    auto process4_and_accumulate =
+        [&](const uint8_t *row0, const uint8_t *row1, const int16_t *window_row,
+            const int16_t *scharr_window_row, int x) {
+          int32x4x2_t diff_scharr =
+              process4(row0, row1, window_row, scharr_window_row, x);
+          sum_diff_scharr_x_v =
+              vaddq_f32(sum_diff_scharr_x_v, vcvtq_f32_s32(diff_scharr.val[0]));
+          sum_diff_scharr_y_v =
+              vaddq_f32(sum_diff_scharr_y_v, vcvtq_f32_s32(diff_scharr.val[1]));
+        };
+    auto process4_and_accumulate_leftovers =
+        [&](const uint8_t *row0, const uint8_t *row1, const int16_t *window_row,
+            const int16_t *scharr_window_row) {
+          int32x4x2_t diff_scharr =
+              process4(row0, row1, window_row, scharr_window_row,
+                       window_width * channels - 4);
+          int32x4_t masked_x = vandq_s32(diff_scharr.val[0], leftover_mask);
+          int32x4_t masked_y = vandq_s32(diff_scharr.val[1], leftover_mask);
+          sum_diff_scharr_x_v =
+              vaddq_f32(sum_diff_scharr_x_v, vcvtq_f32_s32(masked_x));
+          sum_diff_scharr_y_v =
+              vaddq_f32(sum_diff_scharr_y_v, vcvtq_f32_s32(masked_y));
+        };
+    auto process1_and_accumulate = [&](const uint8_t *row0, const uint8_t *row1,
+                                       const int16_t *window_row,
+                                       const int16_t *scharr_window_row,
+                                       int x) {
+      int next =
+          round_fixed_point<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+              row0[x] * coeff_tl + row0[x + channels] * coeff_tr +
+              row1[x] * coeff_bl + row1[x + channels] * coeff_br);
+      int diff = next - window_row[x];
+      sum_diff_scharr_x += static_cast<float>(diff * scharr_window_row[x * 2L]);
+      sum_diff_scharr_y +=
+          static_cast<float>(diff * scharr_window_row[x * 2L + 1]);
+    };
+
+    if (window_width * channels < 8) {
+      // Handle small window width
+      if (window_width * channels > 4) {
+        process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                         const int16_t *window_row,
+                         const int16_t *scharr_window_row) {
+          process4_and_accumulate(row0, row1, window_row, scharr_window_row, 0);
+          process4_and_accumulate_leftovers(row0, row1, window_row,
+                                            scharr_window_row);
+        });
+      } else {
+        // Handle tiny window width
+        process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                         const int16_t *window_row,
+                         const int16_t *scharr_window_row) {
+          for (int x = 0; x < window_width * channels; ++x) {
+            process1_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                    x);
+          }
+        });
+      }
+    } else {
+      switch (window_width * channels & 7) {
+        case 1:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           const int16_t *window_row,
+                           const int16_t *scharr_window_row) {
+            int x = 0;
+            for (; x + 8 <= window_width * channels; x += 8) {
+              process8_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                      x);
+            }
+            process1_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                    x);
+          });
+          break;
+        case 5:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           const int16_t *window_row,
+                           const int16_t *scharr_window_row) {
+            int x = 0;
+            for (; x + 8 <= window_width * channels; x += 8) {
+              process8_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                      x);
+            }
+            process4_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                    x);
+            process1_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                    x + 4);
+          });
+          break;
+        case 6:
+        case 7:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           const int16_t *window_row,
+                           const int16_t *scharr_window_row) {
+            int x = 0;
+            for (; x + 8 <= window_width * channels; x += 8) {
+              process8_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                      x);
+            }
+            process4_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                    x);
+            process4_and_accumulate_leftovers(row0, row1, window_row,
+                                              scharr_window_row);
+          });
+          break;
+        default:
+          process_rows([&](const uint8_t *row0, const uint8_t *row1,
+                           const int16_t *window_row,
+                           const int16_t *scharr_window_row) {
+            for (int x = 0; x + 8 <= window_width * channels; x += 8) {
+              process8_and_accumulate(row0, row1, window_row, scharr_window_row,
+                                      x);
+            }
+            process4_and_accumulate_leftovers(row0, row1, window_row,
+                                              scharr_window_row);
+          });
+          break;
+      }
+    }
+
+    sum_diff_scharr_x += vaddvq_f32(sum_diff_scharr_x_v);
+    sum_diff_scharr_y += vaddvq_f32(sum_diff_scharr_y_v);
+    sum_diff_scharr_x *= KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_diff_scharr_y *= KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+  }
+  // NOLINTEND(readability-function-cognitive-complexity)
+};
+
+KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold) {
+  OpticalFlowWindowBuffer window_buffer(window_width, window_height, channels);
+  if (!window_buffer.window()) {
+    return KLEIDICV_ERROR_ALLOCATION;
+  }
+  return optical_flow_common<OpticalFlow>(
+      window_buffer.window(), window_buffer.deriv_window(), prev_data,
+      prev_data_step, prev_deriv_data, prev_deriv_step, next_data, next_step,
+      width, height, channels, prev_points, next_points, point_count, status,
+      err, window_width, window_height, termination_count, termination_epsilon,
+      get_min_eigen_vals, min_eigen_vals_threshold);
+}
+
+}  // namespace kleidicv_opencv::neon
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_sc.h b/kleidicv_opencv/src/optical_flow/optical_flow_sc.h
new file mode 100644
index 000000000..040cc8ccd
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_sc.h
@@ -0,0 +1,208 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_OPENCV_OPTICAL_FLOW_SC_H
+#define KLEIDICV_OPENCV_OPTICAL_FLOW_SC_H
+
+#include <cstddef>
+
+#include "kleidicv/sve2.h"
+#include "optical_flow_common.h"
+
+namespace KLEIDICV_OPENCV_TARGET_NAMESPACE {
+
+struct OpticalFlow {
+  template <uint64_t shift>
+  static inline svint16_t lerp(svint16_t tl, svint16_t tr, svint16_t bl,
+                               svint16_t br, int16_t coeff_tl, int16_t coeff_tr,
+                               int16_t coeff_bl,
+                               int16_t coeff_br) KLEIDICV_STREAMING_COMPATIBLE {
+    svint32_t b = svmullb_n_s32(tl, coeff_tl);
+    svint32_t t = svmullt_n_s32(tl, coeff_tl);
+
+    b = svmlalb_n_s32(b, tr, coeff_tr);
+    t = svmlalt_n_s32(t, tr, coeff_tr);
+
+    b = svmlalb_n_s32(b, bl, coeff_bl);
+    t = svmlalt_n_s32(t, bl, coeff_bl);
+
+    b = svmlalb_n_s32(b, br, coeff_br);
+    t = svmlalt_n_s32(t, br, coeff_br);
+
+    svint16_t result = svqrshrnb_n_s32(b, shift);
+    result = svqrshrnt_n_s32(result, t, shift);
+    return result;
+  }
+
+  static inline void get_window(
+      int16_t *window, const uint8_t *prev_data, ptrdiff_t prev_data_stride,
+      int channels, int window_corner_x, int window_corner_y, int window_width,
+      int window_height, int16_t coeff_tl, int16_t coeff_tr, int16_t coeff_bl,
+      int16_t coeff_br) KLEIDICV_STREAMING_COMPATIBLE {
+    for (int y = 0; y < window_height; y++) {
+      const uint8_t *const prev_row0 =
+          prev_data + (y + window_corner_y) * prev_data_stride +
+          static_cast<ptrdiff_t>(window_corner_x) * channels;
+      const uint8_t *const prev_row1 = prev_row0 + prev_data_stride;
+
+      int16_t *const window_row =
+          window + static_cast<ptrdiff_t>(y) * window_width;
+
+      for (size_t x = 0; x < static_cast<size_t>(window_width) * channels;
+           x += svcnth()) {
+        svbool_t pg16 =
+            svwhilelt_b16(x, static_cast<size_t>(window_width) * channels);
+
+        svint16_t prev_tl = svld1ub_s16(pg16, prev_row0 + x);
+        svint16_t prev_tr = svld1ub_s16(pg16, prev_row0 + x + channels);
+        svint16_t prev_bl = svld1ub_s16(pg16, prev_row1 + x);
+        svint16_t prev_br = svld1ub_s16(pg16, prev_row1 + x + channels);
+
+        svint16_t prev = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+            prev_tl, prev_tr, prev_bl, prev_br, coeff_tl, coeff_tr, coeff_bl,
+            coeff_br);
+        svst1_s16(pg16, window_row + x, prev);
+      }
+    }
+  }
+
+  static inline void get_scharr(
+      int16_t *scharr_window, const int16_t *scharr_data,
+      ptrdiff_t scharr_stride_elements, int channels, int window_corner_x,
+      int window_corner_y, int window_width, int window_height,
+      int16_t coeff_tl, int16_t coeff_tr, int16_t coeff_bl, int16_t coeff_br,
+      float &sum_scharr_xx, float &sum_scharr_xy,
+      float &sum_scharr_yy) KLEIDICV_STREAMING_COMPATIBLE {
+    svfloat32_t sum_scharr_xx_v = svdup_n_f32(0),
+                sum_scharr_xy_v = svdup_n_f32(0),
+                sum_scharr_yy_v = svdup_n_f32(0);
+    for (int y = 0; y < window_height; y++) {
+      const int16_t *const scharr_row0 =
+          scharr_data + (y + window_corner_y) * scharr_stride_elements +
+          window_corner_x * 2L * channels;
+      const int16_t *const scharr_row1 = scharr_row0 + scharr_stride_elements;
+
+      int16_t *const scharr_window_row = scharr_window + y * 2L * window_width;
+
+      for (size_t x = 0; x < static_cast<size_t>(window_width) * channels;
+           x += svcnth()) {
+        svbool_t pg16 =
+            svwhilelt_b16(x, static_cast<size_t>(window_width) * channels);
+        svbool_t pg32 =
+            svwhilelt_b32(x, static_cast<size_t>(window_width) * channels);
+
+        svint16x2_t scharr_tl = svld2_s16(pg16, scharr_row0 + x * 2L);
+        svint16x2_t scharr_tr =
+            svld2_s16(pg16, scharr_row0 + (x + channels) * 2L);
+        svint16x2_t scharr_bl = svld2_s16(pg16, scharr_row1 + x * 2L);
+        svint16x2_t scharr_br =
+            svld2_s16(pg16, scharr_row1 + (x + channels) * 2L);
+
+        svint16_t scharr_x = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+            svget2(scharr_tl, 0), svget2(scharr_tr, 0), svget2(scharr_bl, 0),
+            svget2(scharr_br, 0), coeff_tl, coeff_tr, coeff_bl, coeff_br);
+
+        svint16_t scharr_y = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS>(
+            svget2(scharr_tl, 1), svget2(scharr_tr, 1), svget2(scharr_bl, 1),
+            svget2(scharr_br, 1), coeff_tl, coeff_tr, coeff_bl, coeff_br);
+
+        svst2_s16(pg16, scharr_window_row + x * 2L,
+                  svcreate2_s16(scharr_x, scharr_y));
+
+        // sum_scharr_xx += scharr_x * scharr_x;
+        sum_scharr_xx_v =
+            svadd_f32_m(pg32, sum_scharr_xx_v,
+                        svcvt_f32_s32_x(pg32, svmullb_s32(scharr_x, scharr_x)));
+        sum_scharr_xx_v =
+            svadd_f32_m(pg32, sum_scharr_xx_v,
+                        svcvt_f32_s32_x(pg32, svmullt_s32(scharr_x, scharr_x)));
+
+        // sum_scharr_xy += scharr_x * scharr_y;
+        sum_scharr_xy_v =
+            svadd_f32_m(pg32, sum_scharr_xy_v,
+                        svcvt_f32_s32_x(pg32, svmullb_s32(scharr_x, scharr_y)));
+        sum_scharr_xy_v =
+            svadd_f32_m(pg32, sum_scharr_xy_v,
+                        svcvt_f32_s32_x(pg32, svmullt_s32(scharr_x, scharr_y)));
+
+        // sum_scharr_yy += scharr_y * scharr_y;
+        sum_scharr_yy_v =
+            svadd_f32_m(pg32, sum_scharr_yy_v,
+                        svcvt_f32_s32_x(pg32, svmullb_s32(scharr_y, scharr_y)));
+        sum_scharr_yy_v =
+            svadd_f32_m(pg32, sum_scharr_yy_v,
+                        svcvt_f32_s32_x(pg32, svmullt_s32(scharr_y, scharr_y)));
+      }
+    }
+    sum_scharr_xx = svaddv_f32(svptrue_b32(), sum_scharr_xx_v) *
+                    KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_scharr_xy = svaddv_f32(svptrue_b32(), sum_scharr_xy_v) *
+                    KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_scharr_yy = svaddv_f32(svptrue_b32(), sum_scharr_yy_v) *
+                    KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+  }
+
+  static inline void get_sum_diff_scharr(
+      const uint8_t *next_data, ptrdiff_t next_stride, const int16_t *window,
+      const int16_t *scharr_window, int channels, int window_corner_x,
+      int window_corner_y, int window_width, int window_height,
+      int16_t coeff_tl, int16_t coeff_tr, int16_t coeff_bl, int16_t coeff_br,
+      float &sum_diff_scharr_x,
+      float &sum_diff_scharr_y) KLEIDICV_STREAMING_COMPATIBLE {
+    svfloat32_t sum_diff_scharr_x_v = svdup_n_f32(0),
+                sum_diff_scharr_y_v = svdup_n_f32(0);
+
+    for (int y = 0; y < window_height; y++) {
+      const uint8_t *next_row0 =
+          next_data + (y + window_corner_y) * next_stride +
+          static_cast<ptrdiff_t>(window_corner_x) * channels;
+      const uint8_t *next_row1 = next_row0 + next_stride;
+      const int16_t *window_row =
+          window + static_cast<ptrdiff_t>(y) * window_width;
+      const int16_t *scharr_window_row = scharr_window + y * 2L * window_width;
+
+      for (size_t x = 0; x < static_cast<size_t>(window_width) * channels;
+           x += svcnth()) {
+        svbool_t pg16 =
+            svwhilelt_b16(x, static_cast<size_t>(window_width) * channels);
+        svbool_t pg32 =
+            svwhilelt_b32(x, static_cast<size_t>(window_width) * channels);
+
+        svint16_t tl = svld1ub_s16(pg16, next_row0 + x);
+        svint16_t tr = svld1ub_s16(pg16, next_row0 + x + channels);
+        svint16_t bl = svld1ub_s16(pg16, next_row1 + x);
+        svint16_t br = svld1ub_s16(pg16, next_row1 + x + channels);
+        svint16_t diff = lerp<KLEIDICV_OPENCV_OPTICAL_FLOW_FRACTION_BITS - 5>(
+            tl, tr, bl, br, coeff_tl, coeff_tr, coeff_bl, coeff_br);
+        svint16_t win = svld1_s16(pg16, window_row + x);
+        diff = svsub_s16_x(pg16, diff, win);
+
+        svint16x2_t scharr_xy = svld2_s16(pg16, scharr_window_row + x * 2L);
+        svint16_t scharr_x = svget2(scharr_xy, 0);
+        svint16_t scharr_y = svget2(scharr_xy, 1);
+
+        sum_diff_scharr_x_v =
+            svadd_f32_m(pg32, sum_diff_scharr_x_v,
+                        svcvt_f32_s32_x(pg32, svmullb_s32(scharr_x, diff)));
+        sum_diff_scharr_x_v =
+            svadd_f32_m(pg32, sum_diff_scharr_x_v,
+                        svcvt_f32_s32_x(pg32, svmullt_s32(scharr_x, diff)));
+        sum_diff_scharr_y_v =
+            svadd_f32_m(pg32, sum_diff_scharr_y_v,
+                        svcvt_f32_s32_x(pg32, svmullb_s32(scharr_y, diff)));
+        sum_diff_scharr_y_v =
+            svadd_f32_m(pg32, sum_diff_scharr_y_v,
+                        svcvt_f32_s32_x(pg32, svmullt_s32(scharr_y, diff)));
+      }
+    }
+    sum_diff_scharr_x = svaddv_f32(svptrue_b32(), sum_diff_scharr_x_v) *
+                        KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+    sum_diff_scharr_y = svaddv_f32(svptrue_b32(), sum_diff_scharr_y_v) *
+                        KLEIDICV_OPENCV_OPTICAL_FLOW_FIXED_POINT_DESCALE;
+  }
+};
+
+}  // namespace KLEIDICV_OPENCV_TARGET_NAMESPACE
+
+#endif  // KLEIDICV_OPENCV_OPTICAL_FLOW_SC_H
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_sme2.cpp b/kleidicv_opencv/src/optical_flow/optical_flow_sme2.cpp
new file mode 100644
index 000000000..ffc807947
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_sme2.cpp
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv_opencv/optical_flow.h"
+#include "optical_flow_sc.h"
+
+namespace kleidicv_opencv::sme2 {
+
+KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t
+optical_flow_u8_streaming(int16_t *window, int16_t *deriv_window,
+                          const uint8_t *prev_data, size_t prev_data_step,
+                          const int16_t *prev_deriv_data,
+                          size_t prev_deriv_step, const uint8_t *next_data,
+                          size_t next_step, int width, int height, int channels,
+                          const float *prev_points, float *next_points,
+                          size_t point_count, uint8_t *status, float *err,
+                          int window_width, int window_height,
+                          int termination_count, double termination_epsilon,
+                          bool get_min_eigen_vals,
+                          float min_eigen_vals_threshold) {
+  return optical_flow_common<OpticalFlow>(
+      window, deriv_window, prev_data, prev_data_step, prev_deriv_data,
+      prev_deriv_step, next_data, next_step, width, height, channels,
+      prev_points, next_points, point_count, status, err, window_width,
+      window_height, termination_count, termination_epsilon, get_min_eigen_vals,
+      min_eigen_vals_threshold);
+}
+
+KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold) {
+  // When targetting SME, always allocate on the heap to avoid the memory page
+  // being written by both Streaming Mode Compute Unit and CPU.
+  constexpr int kStackBufferSize = 0;
+  OpticalFlowWindowBuffer<kStackBufferSize> window_buffer(
+      window_width, window_height, channels);
+  if (!window_buffer.window()) {
+    return KLEIDICV_ERROR_ALLOCATION;
+  }
+  return optical_flow_u8_streaming(
+      window_buffer.window(), window_buffer.deriv_window(), prev_data,
+      prev_data_step, prev_deriv_data, prev_deriv_step, next_data, next_step,
+      width, height, channels, prev_points, next_points, point_count, status,
+      err, window_width, window_height, termination_count, termination_epsilon,
+      get_min_eigen_vals, min_eigen_vals_threshold);
+}
+
+}  // namespace kleidicv_opencv::sme2
diff --git a/kleidicv_opencv/src/optical_flow/optical_flow_sve2.cpp b/kleidicv_opencv/src/optical_flow/optical_flow_sve2.cpp
new file mode 100644
index 000000000..b4198693a
--- /dev/null
+++ b/kleidicv_opencv/src/optical_flow/optical_flow_sve2.cpp
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "kleidicv_opencv/optical_flow.h"
+#include "optical_flow_sc.h"
+
+namespace kleidicv_opencv::sve2 {
+
+KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t optical_flow_u8(
+    const uint8_t *prev_data, size_t prev_data_step,
+    const int16_t *prev_deriv_data, size_t prev_deriv_step,
+    const uint8_t *next_data, size_t next_step, int width, int height,
+    int channels, const float *prev_points, float *next_points,
+    size_t point_count, uint8_t *status, float *err, int window_width,
+    int window_height, int termination_count, double termination_epsilon,
+    bool get_min_eigen_vals, float min_eigen_vals_threshold) {
+  OpticalFlowWindowBuffer window_buffer(window_width, window_height, channels);
+  if (!window_buffer.window()) {
+    return KLEIDICV_ERROR_ALLOCATION;
+  }
+  return optical_flow_common<OpticalFlow>(
+      window_buffer.window(), window_buffer.deriv_window(), prev_data,
+      prev_data_step, prev_deriv_data, prev_deriv_step, next_data, next_step,
+      width, height, channels, prev_points, next_points, point_count, status,
+      err, window_width, window_height, termination_count, termination_epsilon,
+      get_min_eigen_vals, min_eigen_vals_threshold);
+}
+
+}  // namespace kleidicv_opencv::sve2
diff --git a/scripts/cpplint.sh b/scripts/cpplint.sh
index dc36ca41f..4a37490da 100755
--- a/scripts/cpplint.sh
+++ b/scripts/cpplint.sh
@@ -34,5 +34,5 @@ cpplint \
     --recursive \
     --exclude=build \
     --counting=detailed \
-    --filter=-build/c++11,-build/header_guard,-build/include_subdir,-readability/todo,-runtime/references,-whitespace/indent,-whitespace/line_length \
+    --filter=-build/c++11,-build/header_guard,-build/include_subdir,-build/namespaces_headers,-readability/todo,-runtime/references,-whitespace/indent,-whitespace/line_length \
     .
diff --git a/scripts/format.sh b/scripts/format.sh
index 121a43ca2..28178bdd3 100755
--- a/scripts/format.sh
+++ b/scripts/format.sh
@@ -32,6 +32,7 @@ SOURCES="$(find \
     "${KLEIDICV_ROOT_PATH}/adapters" \
     "${KLEIDICV_ROOT_PATH}/benchmark" \
     "${KLEIDICV_ROOT_PATH}/kleidicv" \
+    "${KLEIDICV_ROOT_PATH}/kleidicv_opencv" \
     "${KLEIDICV_ROOT_PATH}/kleidicv_thread" \
     "${KLEIDICV_ROOT_PATH}/test" \
     "${KLEIDICV_ROOT_PATH}/conformity/opencv" \
diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt
index aa30d3afd..529c2ebcb 100644
--- a/test/api/CMakeLists.txt
+++ b/test/api/CMakeLists.txt
@@ -25,6 +25,7 @@ target_include_directories(
   kleidicv-api-test
   PRIVATE ${KLEIDICV_INCLUDE_DIR}
   PRIVATE ${KLEIDICV_TEST_INCLUDE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../kleidicv_opencv/include
   ${CMAKE_CURRENT_SOURCE_DIR}/../../kleidicv_thread/include
 )
 
@@ -39,6 +40,7 @@ endif()
 target_link_libraries(
   kleidicv-api-test
   kleidicv
+  kleidicv_opencv
   kleidicv_thread
   gtest
   gmock
diff --git a/test/api/test_optical_flow.cpp b/test/api/test_optical_flow.cpp
new file mode 100644
index 000000000..cc104c0b2
--- /dev/null
+++ b/test/api/test_optical_flow.cpp
@@ -0,0 +1,728 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <random>
+
+#include "framework/array.h"
+#include "framework/utils.h"
+#include "kleidicv/utils.h"
+#include "kleidicv_opencv/kleidicv_opencv.h"
+#include "test_config.h"
+
+const int DEFAULT_WINDOW_SIZE = 5;
+const int DEFAULT_TERMINATION_COUNT = 30;
+const float DEFAULT_MAX_ERROR = 0.2F;
+const float DEFAULT_TERMINATION_EPSILON = 0.01F;
+const float DEFAULT_MIN_EIGEN_VALS_THRESHOLD = 0.0001F;
+
+struct OpticalFlowTestParams {
+  std::vector<std::vector<uint8_t>> prev_image, next_image;
+  std::vector<float> prev_points, next_points;
+  std::vector<uint8_t> status;
+  std::vector<float> err, eigenvals;
+  int window_size = DEFAULT_WINDOW_SIZE;
+  int termination_count = DEFAULT_TERMINATION_COUNT;
+  float termination_epsilon = DEFAULT_TERMINATION_EPSILON;
+  float min_eigen_vals_threshold = DEFAULT_MIN_EIGEN_VALS_THRESHOLD;
+  float max_error = DEFAULT_MAX_ERROR;
+  friend void PrintTo(const OpticalFlowTestParams &v, std::ostream *os) {
+    *os << "([\n";
+    for (size_t y = 0; y < v.next_image.size(); ++y) {
+      const auto &row = v.next_image[y];
+      *os << "  [";
+      for (size_t x = 0; x < row.size(); ++x) {
+        *os << std::setw(3) << unsigned{row[x]};
+        if (x + 1 != row.size()) {
+          *os << ", ";
+        }
+      }
+      *os << "]";
+      if (y + 1 != v.next_image.size()) {
+        *os << ",\n";
+      }
+    }
+    *os << "], " << v.window_size << ")";
+  }
+};
+
+// Like kleidicv_scharr_interleaved_s16_u8 but handles edge pixels
+static void scharr_xy(const uint8_t *src, size_t src_stride, int16_t *dst,
+                      size_t dst_stride, size_t width, size_t height) {
+  for (size_t y = 0; y < height; y++) {
+    const uint8_t *src_0 = src;
+    if (y > 0) {
+      src_0 += src_stride * (y - 1);
+    } else if (height > 1) {
+      src_0 += src_stride;
+    }
+
+    const uint8_t *src_1 = src + src_stride * y;
+
+    const uint8_t *src_2 = src;
+    if (y + 1 < height) {
+      src_2 += src_stride * (y + 1);
+    } else if (height > 1) {
+      src_2 += src_stride * (height - 2);
+    }
+
+    int16_t *dst_row = dst + y * dst_stride / sizeof(int16_t);
+    for (size_t x = 0; x < width; ++x) {
+      size_t x0 = 0;
+      if (x > 0) {
+        x0 = x - 1;
+      } else if (width > 1) {
+        x0 = 1;
+      }
+
+      size_t x2 = 0;
+      if (x + 1 < width) {
+        x2 = x + 1;
+      } else if (width > 1) {
+        x2 = width - 2;
+      }
+      dst_row[x * 2] = static_cast<int16_t>(
+          (src_0[x2] + src_2[x2] - src_0[x0] - src_2[x0]) * 3 +
+          (src_1[x2] - src_1[x0]) * 10);
+      dst_row[x * 2 + 1] = static_cast<int16_t>(
+          (src_2[x0] + src_2[x2] - src_0[x0] - src_0[x2]) * 3 +
+          (src_2[x] - src_0[x]) * 10);
+    }
+  }
+}
+
+template <typename T>
+static std::vector<T> flatten_vector_of_vectors(
+    const std::vector<std::vector<T>> &v) {
+  std::vector<T> result;
+  for (const auto &row : v) {
+    result.insert(result.end(), row.begin(), row.end());
+  }
+  return result;
+}
+
+static ptrdiff_t border_reverse(ptrdiff_t x, ptrdiff_t width) {
+  if (width < 2) {
+    return 0;
+  }
+  x = abs(x);
+  x %= width * 2 - 2;
+  x -= width - 1;
+  x = abs(x);
+  x = width - x - 1;
+  return x;
+}
+
+template <typename T>
+static std::vector<T> pad_image_border_reverse(const T *src, size_t width,
+                                               size_t height, size_t pad_left,
+                                               size_t pad_right, size_t pad_y) {
+  size_t dst_stride = width + pad_left + pad_right;
+  std::vector<T> dst(dst_stride * (height + pad_y * 2));
+  for (size_t y = 0; y < height + pad_y * 2; ++y) {
+    ptrdiff_t src_y = border_reverse(static_cast<ptrdiff_t>(y - pad_y),
+                                     static_cast<ptrdiff_t>(height));
+    const T *src_row = src + src_y * width;
+
+    T *dst_row = dst.data() + y * dst_stride;
+
+    for (size_t x = 0; x < dst_stride; ++x) {
+      dst_row[x] = src_row[border_reverse(static_cast<ptrdiff_t>(x - pad_left),
+                                          static_cast<ptrdiff_t>(width))];
+    }
+  }
+  return dst;
+}
+
+static void extend_images_and_test(
+    const uint8_t *prev_image, const uint8_t *next_image, size_t width,
+    size_t height, size_t prev_image_pad_right, size_t next_image_pad_right,
+    size_t scharr_pad_right, size_t window_width, size_t window_height,
+    const std::vector<float> &prev_points,
+    const std::vector<float> &next_points, const std::vector<uint8_t> &status,
+    const std::vector<float> &error, const std::vector<float> &eigenvals,
+    float max_error, int termination_count, float termination_epsilon,
+    float min_eigen_vals_threshold) {
+  const size_t prev_image_stride =
+      width + window_width * 2 + prev_image_pad_right;
+  const size_t next_image_stride =
+      width + window_width * 2 + next_image_pad_right;
+  const size_t scharr_element_stride =
+      (width + window_width * 2) * 2 + scharr_pad_right;
+  const size_t point_count = prev_points.size() / 2;
+
+  // In opencv/modules/video/src/lkpyramid.cpp criteria.epsilon is multiplied by
+  // itself before it reaches the HAL.
+  termination_epsilon *= termination_epsilon;
+
+  ASSERT_EQ(prev_points.size(), next_points.size());
+
+  std::vector<uint8_t> padded_prev_image = pad_image_border_reverse(
+      prev_image, width, height, window_width,
+      window_width + prev_image_pad_right, window_height);
+  std::vector<uint8_t> padded_next_image = pad_image_border_reverse(
+      next_image, width, height, window_width,
+      window_width + next_image_pad_right, window_height);
+
+  std::vector<int16_t> scharr(scharr_element_stride *
+                              (height + window_height * 2));
+  scharr_xy(
+      padded_prev_image.data() + prev_image_stride * window_height +
+          window_width,
+      prev_image_stride,
+      scharr.data() + window_height * scharr_element_stride + window_width,
+      scharr_element_stride * sizeof(int16_t), width, height);
+
+  const std::vector<uint8_t> empty_status;
+  const std::vector<float> empty_error;
+
+  for (const auto &[expected_status, expected_err, get_min_eigen_vals] : {
+           std::make_tuple(status, error, false),
+           std::make_tuple(status, eigenvals, true),
+           std::make_tuple(status, empty_error, false),
+           std::make_tuple(empty_status, empty_error, false),
+           std::make_tuple(empty_status, eigenvals, true),
+       }) {
+    std::vector<float> actual_next_points(prev_points);
+    std::vector<uint8_t> actual_status(expected_status.size(), 1);
+    std::vector<float> actual_err(expected_err.size(), -1);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_opencv_optical_flow_u8(
+            padded_prev_image.data() + prev_image_stride * window_height +
+                window_width,
+            prev_image_stride,
+            scharr.data() + scharr_element_stride * window_height +
+                window_width,
+            scharr_element_stride * sizeof(int16_t),
+            padded_next_image.data() + next_image_stride * window_height +
+                window_width,
+            next_image_stride, width, height, 1 /*channels*/,
+            prev_points.data(), actual_next_points.data(), point_count,
+            actual_status.empty() ? nullptr : actual_status.data(),
+            actual_err.empty() ? nullptr : actual_err.data(), window_width,
+            window_height, termination_count, termination_epsilon,
+            get_min_eigen_vals, min_eigen_vals_threshold));
+
+    ASSERT_THAT(actual_status, ::testing::ElementsAreArray(expected_status));
+    EXPECT_THAT(
+        actual_next_points,
+        ::testing::Pointwise(::testing::FloatNear(max_error), next_points));
+    EXPECT_THAT(actual_err, ::testing::Pointwise(::testing::FloatNear(0.001F),
+                                                 expected_err));
+  }
+}
+
+// If set, expect output to almost exactly match results from OpenCV.
+// Otherwise results should approximately match the theoretically correct value.
+// The theoretically correct value is calculated according to the positions of
+// the geometry that was rendered (e.g. the corner of a square). It is not
+// expected that any optical flow algorithm will be able to calculate such
+// values accurately from the rendered image so the margin of error is large.
+#define EXPECT_OPENCV 1
+
+class OpticalFlow : public testing::TestWithParam<OpticalFlowTestParams> {
+ public:
+  void do_test(size_t prev_image_pad_right, size_t next_image_pad_right,
+               size_t scharr_pad_right) const {
+    const size_t height = GetParam().prev_image.size();
+    const size_t width = GetParam().prev_image.at(0).size();
+    const size_t window_width = GetParam().window_size;
+    const size_t window_height = GetParam().window_size;
+
+    ASSERT_EQ(height, GetParam().next_image.size());
+    for (const auto &row : GetParam().prev_image) {
+      ASSERT_EQ(width, row.size());
+    }
+    for (const auto &row : GetParam().next_image) {
+      ASSERT_EQ(width, row.size());
+    }
+
+    const std::vector<uint8_t> prev_image =
+        flatten_vector_of_vectors(GetParam().prev_image);
+    const std::vector<uint8_t> next_image =
+        flatten_vector_of_vectors(GetParam().next_image);
+
+#if EXPECT_OPENCV
+    const float max_error = 0.001F;
+#else
+    const float max_error = GetParam().max_error;
+#endif
+
+    extend_images_and_test(
+        prev_image.data(), next_image.data(), width, height,
+        prev_image_pad_right, next_image_pad_right, scharr_pad_right,
+        window_width, window_height, GetParam().prev_points,
+        GetParam().next_points, GetParam().status, GetParam().err,
+        GetParam().eigenvals, max_error, GetParam().termination_count,
+        GetParam().termination_epsilon, GetParam().min_eigen_vals_threshold);
+  }
+};
+
+TEST_P(OpticalFlow, NoPadding) { do_test(0, 0, 0); }
+TEST_P(OpticalFlow, PadPrev) { do_test(1, 0, 0); }
+TEST_P(OpticalFlow, PadNext) { do_test(0, 1, 0); }
+TEST_P(OpticalFlow, PadDeriv) { do_test(0, 0, 1); }
+
+static std::vector<std::vector<uint8_t>> make_image(size_t width, size_t height,
+                                                    uint8_t gray) {
+  return std::vector<std::vector<uint8_t>>(height,
+                                           std::vector<uint8_t>(width, gray));
+}
+
+static std::vector<std::vector<uint8_t>> draw_rect_on_image(
+    std::vector<std::vector<uint8_t>> &&image, size_t x, size_t y, size_t width,
+    size_t height, uint8_t gray) {
+  for (size_t i = y; i < y + height; ++i) {
+    for (size_t j = x; j < x + width; ++j) {
+      image[i][j] = gray;
+    }
+  }
+  return std::move(image);
+}
+
+static std::vector<std::vector<uint8_t>> make_corner_image(size_t width,
+                                                           size_t height,
+                                                           size_t x, size_t y,
+                                                           uint8_t background,
+                                                           uint8_t foreground) {
+  return draw_rect_on_image(make_image(width, height, background), 0, 0, x, y,
+                            foreground);
+}
+
+// Disable check for possible exceptions thrown outside main.
+// The check isn't important in a test program.
+// NOLINTBEGIN(cert-err58-cpp)
+
+// 4x4 square rotated by 15 degrees
+static const std::vector<std::vector<uint8_t>> square_rotated_by_15_degrees = {
+    // clang-format off
+    {0,   0,   0,   0,   0,   0,   0,   0},
+    {0,   0,   0,   0,  56,  49,   0,   0},
+    {0,  49, 167, 239, 255, 167,   0,   0},
+    {0,  56, 255, 255, 255, 231,   0,   0},
+    {0,   0, 239, 255, 255, 255,  56,   0},
+    {0,   0, 167, 255, 231, 167,  49,   0},
+    {0,   0,  49,  56,   0,   0,   0,   0},
+    {0,   0,   0,   0,   0,   0,   0,   0},
+    // clang-format on
+};
+
+// 4x4 square rotated by 30 degrees and offset
+static const std::vector<std::vector<uint8_t>> square_rotated_by_30_degrees = {
+    // clang-format off
+    {  0,   0,  24, 123,   0,   0,   0,   0},
+    {  0, 112, 239, 255, 112,   0,   0,   0},
+    {123, 255, 255, 255, 239,  24,   0,   0},
+    { 24, 239, 255, 255, 255, 123,   0,   0},
+    {  0, 112, 255, 239, 112,   0,   0,   0},
+    {  0,   0, 123,  24,   0,   0,   0,   0},
+    {  0,   0,   0,   0,   0,   0,   0,   0},
+    {  0,   0,   0,   0,   0,   0,   0,   0},
+    // clang-format on
+};
+
+// NOLINTEND(cert-err58-cpp)
+
+static OpticalFlowTestParams make_rotating_square_test(
+    int window_size,
+    [[maybe_unused]] const std::vector<float> &opencv_next_points,
+    const std::vector<float> &err, const std::vector<float> &eigenvals) {
+  float max_error = 1.0F;
+  return OpticalFlowTestParams{square_rotated_by_15_degrees,
+                               square_rotated_by_30_degrees,
+                               // bottom-right and bottom-left corners
+                               {5.95F, 4.91F, 2.09F, 5.95F},
+#if EXPECT_OPENCV
+                               opencv_next_points,
+#else
+                               {5.23F, 3.23F, 1.77F, 5.23F},
+#endif
+                               {1, 1},
+                               err,
+                               eigenvals,
+                               window_size,
+                               DEFAULT_TERMINATION_COUNT,
+                               DEFAULT_TERMINATION_EPSILON,
+                               DEFAULT_MIN_EIGEN_VALS_THRESHOLD,
+                               max_error};
+}
+
+using P = OpticalFlowTestParams;
+
+INSTANTIATE_TEST_SUITE_P(
+    , OpticalFlow,
+    testing::Values(
+        // Bottom-right corner no movement
+        P{make_corner_image(4, 4, 2, 2, 0, 9),
+          make_corner_image(4, 4, 2, 2, 0, 9),
+          {2.0F, 2.0F},
+          {2.0F, 2.0F},
+          {1},
+          {0.0F},
+          {0.00189F}},
+
+        // Bottom-right corner moving one pixel right
+        P{make_corner_image(4, 4, 2, 2, 0, 9),
+          make_corner_image(4, 4, 3, 2, 0, 9),
+          {2.0F, 2.0F},
+#if EXPECT_OPENCV
+          {3.10952997F, 1.96065569F},
+#else
+          {3.0F, 2.0F},
+#endif
+          {1},
+          {1.655F},
+          {0.00189F}},
+
+        // Bottom-right corner moving one pixel down
+        P{make_corner_image(4, 4, 2, 2, 0, 9),
+          make_corner_image(4, 4, 2, 3, 0, 9),
+          {2.0F, 2.0F},
+#if EXPECT_OPENCV
+          {1.96065569F, 3.10952997F},
+#else
+          {2.0F, 3.0F},
+#endif
+          {1},
+          {1.655F},
+          {0.00189F}},
+
+        // Bottom-right corner moving one pixel down and one pixel right
+        P{make_corner_image(4, 4, 2, 2, 0, 9),
+          make_corner_image(4, 4, 3, 3, 0, 9),
+          {2.0F, 2.0F},
+#if EXPECT_OPENCV
+          {2.99960446F, 2.99960446F},
+#else
+          {3.0F, 3.0F},
+#endif
+          {1},
+          {4.32F},
+          {0.00189F}},
+
+        // Bottom-right corner moving half a pixel right
+        P{make_corner_image(4, 4, 2, 2, 0, 255),
+          draw_rect_on_image(make_corner_image(4, 4, 2, 2, 0, 255), 2, 0, 1, 2,
+                             127),
+          {2.0F, 2.0F},
+#if EXPECT_OPENCV
+          {2.5086112F, 1.902421F},
+#else
+          {2.5F, 2.0F},
+#endif
+          {1},
+          {33.7175F},
+          {1.51807F}},
+
+        // Some points out of bounds
+        P{make_corner_image(4, 4, 2, 2, 0, 9),
+          make_corner_image(4, 4, 2, 2, 0, 9),
+          {2, 2, -6, 1, 1, -6, 10, 1, 1, 10},
+          {2, 2, -6, 1, 1, -6, 10, 1, 1, 10},
+          {1, 0, 0, 0, 0},
+          {0, -1, -1, -1, -1},
+          {0.00189F, -1, -1, -1, -1}},
+
+        // Single pixel image
+        P{{{0}}, {{0}}, {0.5F, 0.5F}, {0.5F, 0.5F}, {0}, {-1}, {0}},
+
+        P{{{0}},
+          {{0}},
+          {0.5F, 0.5F},
+          {0.5F, 0.5F},
+          {0},
+          {-1},
+          {0},
+          DEFAULT_WINDOW_SIZE,
+          DEFAULT_TERMINATION_COUNT,
+          DEFAULT_TERMINATION_EPSILON,
+          DEFAULT_MIN_EIGEN_VALS_THRESHOLD,
+          DEFAULT_MAX_ERROR},
+
+        // Edge case of termination count 0
+        P{{{0}},
+          {{0}},
+          {0.5F, 0.5F},
+          {0.5F, 0.5F},
+          {0},
+          {-1},
+          {0},
+          DEFAULT_WINDOW_SIZE,
+          0 /*termination_count*/,
+          DEFAULT_TERMINATION_EPSILON,
+          DEFAULT_MIN_EIGEN_VALS_THRESHOLD,
+          DEFAULT_MAX_ERROR},
+
+        // Edge case of negative termination epsilon
+        P{{{0}},
+          {{0}},
+          {0.5F, 0.5F},
+          {0.5F, 0.5F},
+          {0},
+          {-1},
+          {0},
+          DEFAULT_WINDOW_SIZE,
+          DEFAULT_TERMINATION_COUNT,
+          -1 /*termination_epsilon*/,
+          DEFAULT_MIN_EIGEN_VALS_THRESHOLD,
+          DEFAULT_MAX_ERROR},
+
+        // Edge case of min eigen vals threshold 0
+        P{{{0}},
+          {{0}},
+          {0.5F, 0.5F},
+          {0.5F, 0.5F},
+          {0},
+          {-1},
+          {0},
+          DEFAULT_WINDOW_SIZE,
+          DEFAULT_TERMINATION_COUNT,
+          DEFAULT_TERMINATION_EPSILON,
+          0 /*min_eigen_vals_threshold*/,
+          DEFAULT_MAX_ERROR},
+
+        // Triggers a projected point going out of bounds
+        P{{{201, 181, 4}, {64, 133, 70}, {35, 133, 127}},
+          {{64, 242, 103}, {5, 88, 143}, {139, 219, 107}},
+          {2.23284149F, 0.747504354F},
+          {2.9699F, -2.02478F},
+          {0},
+          {-1},
+          {0.03813F},
+          3},
+
+        // Triggers a projected point going out of bounds on the last iteration
+        P{{{249, 6, 81}, {20, 242, 34}, {133, 143, 224}},
+          {{15, 12, 160}, {150, 194, 117}, {11, 82, 246}},
+          {1.85238028F, 0.249280542F},
+          {4.38032F, -0.25043F},
+          {0},
+          {-1},
+          {0.13125F},
+          3},
+
+        // Triggers successful termination due to low "velocity"
+        P{{{110, 23, 46}, {72, 228, 77}, {6, 87, 244}},
+          {{246, 50, 100}, {227, 18, 190}, {18, 168, 134}},
+          {0.967025876F, 1.84503651F},
+          {1.64828F, 2.58103F},
+          {1},
+          {57.18056F},
+          {0.42731F},
+          3},
+
+        make_rotating_square_test(3, {5.03495F, 3.60691F, 1.39537F, 5.03591F},
+                                  {10.05208F, 10.04861F}, {1.37410F, 1.37343F}),
+        make_rotating_square_test(4, {5.04758F, 3.73075F, 1.12477F, 5.01238F},
+                                  {9.56445F, 15.63086F}, {1.13227F, 1.14002F}),
+        make_rotating_square_test(5, {5.00314F, 3.81633F, 1.16250F, 5.03388F},
+                                  {12.18500F, 23.80125F}, {1.60865F, 1.62053F}),
+        make_rotating_square_test(6, {4.95174F, 3.93273F, 0.77928F, 5.00313F},
+                                  {18.60764F, 50.63194F}, {1.37666F, 1.38456F}),
+        make_rotating_square_test(7, {4.93923F, 3.94952F, 1.01654F, 4.99058F},
+                                  {26.49809F, 55.16071F}, {1.60671F, 1.61532F}),
+        make_rotating_square_test(8, {4.96286F, 3.91138F, 0.94513F, 4.99251F},
+                                  {34.55029F, 57.06738F}, {1.22516F, 1.22873F}),
+        make_rotating_square_test(9, {4.98960F, 3.83407F, 1.07763F, 5.01845F},
+                                  {47.52546F, 60.11883F}, {1.55569F, 1.55563F}),
+        make_rotating_square_test(10, {4.95152F, 3.78445F, 0.96519F, 4.98130F},
+                                  {53.02469F, 54.21344F}, {1.18499F, 1.18493F}),
+        make_rotating_square_test(11, {4.89623F, 3.86437F, 1.06083F, 4.95983F},
+                                  {60.61518F, 56.73373F}, {1.29844F, 1.29850F}),
+        make_rotating_square_test(12, {4.80831F, 3.80666F, 0.94843F, 4.84862F},
+                                  {66.85699F, 60.20595F}, {0.87564F, 0.87557F}),
+        make_rotating_square_test(13, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {71.80584F, 66.91994F}, {0.93144F, 0.93147F}),
+        make_rotating_square_test(14, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {76.03428F, 75.67442F}, {0.64332F, 0.64328F}),
+        make_rotating_square_test(15, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {81.28000F, 84.23681F}, {0.69961F, 0.69964F}),
+        make_rotating_square_test(16, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {84.53223F, 83.54919F}, {0.49254F, 0.49251F}),
+        make_rotating_square_test(17, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {89.60305F, 83.21745F}, {0.54468F, 0.54470F}),
+        make_rotating_square_test(18, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {89.92023F, 79.63406F}, {0.38917F, 0.38914F}),
+        make_rotating_square_test(19, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {91.68473F, 78.35907F}, {0.43605F, 0.43606F}),
+        make_rotating_square_test(20, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {91.01719F, 77.84946F}, {0.31523F, 0.31521F}),
+        make_rotating_square_test(21, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {92.03614F, 80.94381F}, {0.35694F, 0.35696F}),
+        make_rotating_square_test(22, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {89.82928F, 81.32819F}, {0.26052F, 0.26050F}),
+        make_rotating_square_test(23, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {89.75030F, 85.61785F}, {0.29757F, 0.29758F}),
+        make_rotating_square_test(24, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {85.35243F, 82.91254F}, {0.21891F, 0.21889F}),
+        make_rotating_square_test(25, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {83.08310F, 83.38870F}, {0.25186F, 0.25187F}),
+        make_rotating_square_test(26, {4.80835F, 3.81359F, 0.94093F, 4.84875F},
+                                  {79.62486F, 79.25624F}, {0.18653F, 0.18651F}),
+        make_rotating_square_test(27, {4.89751F, 3.89167F, 1.04246F, 4.96033F},
+                                  {78.02598F, 78.54677F}, {0.21593F, 0.21594F}),
+        make_rotating_square_test(28, {4.80836F, 3.81359F, 0.94093F, 4.84875F},
+                                  {76.03428F, 75.67442F}, {0.16083F, 0.16082F}),
+        make_rotating_square_test(29, {4.89743F, 3.89168F, 1.04246F, 4.96033F},
+                                  {75.24889F, 75.44006F}, {0.18717F, 0.18718F}),
+        make_rotating_square_test(30, {4.80836F, 3.81359F, 0.94093F, 4.84875F},
+                                  {72.85309F, 72.46344F}, {0.14010F, 0.14009F}),
+        make_rotating_square_test(31, {4.89743F, 3.89168F, 1.04246F, 4.96033F},
+                                  {71.25634F, 71.65993F},
+                                  {0.16380F, 0.16381F})));
+
+TEST(OpticalFlowTest, MultipleChannelsNotImplemented) {
+  const int channels = 3;
+  const uint8_t prev_image[1] = {}, next_image[1] = {};
+  const int16_t scharr[1] = {};
+  const float prev_points[2] = {};
+  float next_points[2] = {}, err[1] = {};
+  uint8_t status[1] = {};
+  ASSERT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            kleidicv_opencv_optical_flow_u8(
+                prev_image, sizeof(prev_image), scharr, sizeof(scharr),
+                next_image, sizeof(next_image), 1, 1, channels, prev_points,
+                next_points, 1, status, err, 3, 3, 30, 0.0001, false, 0.0001));
+}
+
+TEST(OpticalFlowTest, NextPointOutOfRange) {
+  // This test causes the predicted next point to fall outside the padded image.
+  // It does this by padding the image with non-zero values, which could be
+  // considered invalid input, but it's tricky to trigger otherwise.
+  const ptrdiff_t width = 1, height = 1;
+  const ptrdiff_t window_width = 3, window_height = 3;
+  const ptrdiff_t padded_width = width + window_width * 2,
+                  padded_height = height + window_height * 2;
+
+  const uint8_t prev_image[padded_width * padded_height] = {
+      201, 181, 4,   64,  133, 70,  35,  133, 127, 190, 61,  233, 62,
+      183, 196, 117, 198, 243, 82,  65,  129, 23,  7,   251, 76,  123,
+      73,  160, 182, 17,  22,  108, 22,  159, 57,  252, 218, 214, 239,
+      125, 92,  132, 7,   28,  176, 188, 135, 212, 125};
+  const uint8_t next_image[padded_width * padded_height] = {
+      64,  242, 103, 5,   88,  143, 139, 219, 107, 63, 81,  42, 50,
+      247, 20,  65,  149, 112, 136, 11,  178, 232, 38, 158, 92, 76,
+      236, 193, 185, 124, 194, 152, 69,  56,  111, 54, 72,  93, 220,
+      5,   188, 108, 232, 250, 124, 150, 116, 222, 248};
+
+  int16_t scharr[padded_width * padded_height * 2] = {};
+  scharr_xy(prev_image, padded_width, scharr, padded_width * sizeof(int16_t),
+            padded_width, padded_height);
+
+  const size_t point_count = 1;
+  float prev_points[point_count * 2] = {0.5F, 0.5F};
+  float next_points[point_count * 2] = {0.5F, 0.5F};
+  float err[point_count] = {};
+  uint8_t status[point_count] = {1};
+
+  for (uint8_t *actual_status : {status, static_cast<uint8_t *>(nullptr)}) {
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_opencv_optical_flow_u8(
+            prev_image + padded_width * window_height + window_width,
+            padded_width, scharr + padded_width * window_height + window_width,
+            sizeof(int16_t) * padded_width,
+            next_image + padded_width * window_height + window_width,
+            padded_width, width, height, 1, prev_points, next_points,
+            point_count, actual_status, err, window_width, window_height,
+            DEFAULT_TERMINATION_COUNT,
+            DEFAULT_TERMINATION_EPSILON * DEFAULT_TERMINATION_EPSILON, false,
+            DEFAULT_MIN_EIGEN_VALS_THRESHOLD));
+  }
+}
+
+TEST(OpticalFlowTest, Fuzz) {
+  const ptrdiff_t width = 3, height = width;
+  const ptrdiff_t window_width = 3, window_height = 3;
+  const ptrdiff_t padded_width = width + window_width * 2,
+                  padded_height = height + window_height * 2;
+  const size_t point_count = width * height;
+  uint8_t prev_image[width * height] = {}, next_image[width * height] = {};
+  float prev_points[point_count * 2] = {};
+  float next_points[point_count * 2] = {}, err[point_count] = {};
+  uint8_t status[point_count] = {};
+
+  std::mt19937_64 rng(test::Options::seed());
+  std::uniform_int_distribution<uint8_t> dist(0, 255);
+  std::uniform_real_distribution<float> distpt(0, width);
+
+  for (int j = 0; j < 1000; ++j) {
+    for (int i = 0; i < width * height; ++i) {
+      prev_image[i] = dist(rng);
+      next_image[i] = dist(rng);
+    }
+
+    const std::vector<uint8_t> padded_prev_image = pad_image_border_reverse(
+        prev_image, width, height, window_width, window_width, window_height);
+    const std::vector<uint8_t> padded_next_image = pad_image_border_reverse(
+        next_image, width, height, window_width, window_width, window_height);
+
+    for (ptrdiff_t y = 0; y < height; ++y) {
+      for (ptrdiff_t x = 0; x < width; ++x) {
+        prev_points[2 * (x + y * width)] = distpt(rng);
+        prev_points[2 * (x + y * width) + 1] = distpt(rng);
+        next_points[2 * (x + y * width)] = distpt(rng);
+        next_points[2 * (x + y * width) + 1] = distpt(rng);
+      }
+    }
+    memcpy(next_points, prev_points, sizeof(next_points));
+    for (size_t i = 0; i < point_count; ++i) {
+      status[i] = 1;
+    }
+
+    int16_t scharr[padded_width * padded_height * 2] = {};
+    scharr_xy(
+        padded_prev_image.data() + padded_width * window_height + window_width,
+        padded_width, scharr + window_height * padded_width + window_width,
+        padded_width * sizeof(int16_t), width, height);
+
+    ASSERT_EQ(
+        KLEIDICV_OK,
+        kleidicv_opencv_optical_flow_u8(
+            padded_prev_image.data() + padded_width * window_height +
+                window_width,
+            padded_width, scharr + padded_width * window_height + window_width,
+            sizeof(int16_t) * padded_width,
+            padded_next_image.data() + padded_width * window_height +
+                window_width,
+            padded_width, width, height, 1, prev_points, next_points,
+            point_count, status, err, window_width, window_height,
+            DEFAULT_TERMINATION_COUNT,
+            DEFAULT_TERMINATION_EPSILON * DEFAULT_TERMINATION_EPSILON, false,
+            DEFAULT_MIN_EIGEN_VALS_THRESHOLD));
+  }
+}
+
+#ifdef KLEIDICV_ALLOCATION_TESTS
+TEST(OpticalFlowTest, CannotAllocateWindow) {
+  MockMallocToFail::enable();
+  const ptrdiff_t width = 1, height = 1;
+  const ptrdiff_t window_width = 100, window_height = 100;
+  const ptrdiff_t padded_width = width + window_width * 2,
+                  padded_height = height + window_height * 2;
+  const uint8_t prev_image[padded_width * padded_height] = {};
+  const uint8_t next_image[padded_width * padded_height] = {};
+  int16_t scharr[padded_width * padded_height * 2] = {};
+  const size_t kPointCount = 1;
+  float prev_points[kPointCount * 2] = {};
+  float next_points[kPointCount * 2] = {};
+  float err[kPointCount];
+  uint8_t status[kPointCount];
+  kleidicv_error_t ret = kleidicv_opencv_optical_flow_u8(
+      prev_image + padded_width * window_height + window_width, padded_width,
+      scharr + padded_width * window_height + window_width,
+      sizeof(int16_t) * padded_width,
+      next_image + padded_width * window_height + window_width, padded_width,
+      width, height, 1, prev_points, next_points, kPointCount, status, err,
+      window_width, window_height, DEFAULT_TERMINATION_COUNT,
+      DEFAULT_TERMINATION_EPSILON * DEFAULT_TERMINATION_EPSILON, false,
+      DEFAULT_MIN_EIGEN_VALS_THRESHOLD);
+  MockMallocToFail::disable();
+  ASSERT_EQ(KLEIDICV_ERROR_ALLOCATION, ret);
+}
+#endif
diff --git a/test/api/test_small_buffer.cpp b/test/api/test_small_buffer.cpp
new file mode 100644
index 000000000..ebda46b34
--- /dev/null
+++ b/test/api/test_small_buffer.cpp
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+
+#include <cmath>
+
+#include "kleidicv/containers/small_buffer.h"
+
+template <typename ElementType>
+class SmallBufferTest : public testing::Test {};
+
+using ElementTypes = ::testing::Types<int16_t>;
+
+TYPED_TEST_SUITE(SmallBufferTest, ElementTypes);
+
+TYPED_TEST(SmallBufferTest, Stack) {
+  kleidicv::SmallBuffer<TypeParam, 10> buf{1};
+  buf.get()[0] = -12345;
+  EXPECT_EQ(-12345, buf.get()[0]);
+  EXPECT_LT(abs(reinterpret_cast<ptrdiff_t>(buf.get()) -
+                reinterpret_cast<ptrdiff_t>(&buf)),
+            sizeof(buf));
+}
+
+TYPED_TEST(SmallBufferTest, Heap) {
+  kleidicv::SmallBuffer<TypeParam, 10> buf{123};
+  buf.get()[100] = -12345;
+  EXPECT_EQ(-12345, buf.get()[100]);
+  EXPECT_GE(abs(reinterpret_cast<ptrdiff_t>(buf.get()) -
+                reinterpret_cast<ptrdiff_t>(&buf)),
+            sizeof(buf));
+}
-- 
GitLab