From fa397418601398046b2caf6c6a02ff9753a1870c Mon Sep 17 00:00:00 2001 From: Mark Horvath Date: Fri, 4 Oct 2024 14:47:35 +0000 Subject: [PATCH] Fix allocation size in SeparableFilterWorkspace Some extra space is allocated to account for SVE interleaving stores, but the amount of space needed depends on the element size of the intermediate buffer. With this change, 3 extra elements are allocated, not just 3 more bytes. (This is true for single-channel input; for multi-channel input more data was allocated.) So far this has not been a problem, because in the worst case we use a 32-bit intermediate type with svst4, where 12 bytes of extra space are needed. However, the size of the allocation is also extended by kAlignment-1, which equals 15. So, in total, 18 more bytes were allocated for single-channel input. --- kleidicv/include/kleidicv/workspace/separable.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kleidicv/include/kleidicv/workspace/separable.h b/kleidicv/include/kleidicv/workspace/separable.h index 899bccf69..dd10268df 100644 --- a/kleidicv/include/kleidicv/workspace/separable.h +++ b/kleidicv/include/kleidicv/workspace/separable.h @@ -83,14 +83,16 @@ class SeparableFilterWorkspace final { static Pointer create(Rectangle rect, size_t channels, size_t intermediate_size) KLEIDICV_STREAMING_COMPATIBLE { - size_t buffer_rows_width = intermediate_size * rect.width(); + size_t buffer_rows_number_of_elements = rect.width() * channels; // Adding more elements because of SVE, where interleaving stores are // governed by one predicate. For example, if a predicate requires 7 uint8_t // elements and an algorithm performs widening to 16 bits, the resulting // interleaving store will still be governed by the same predicate, thus - // saving 8 elements. Choosing '3' to account for svst4(). - buffer_rows_width += 3; - size_t buffer_rows_stride = buffer_rows_width * channels; + // storing 8 elements. Choosing '3' to account for svst4(). 
+ buffer_rows_number_of_elements += 3; + + size_t buffer_rows_stride = + buffer_rows_number_of_elements * intermediate_size; size_t buffer_rows_size = buffer_rows_stride; buffer_rows_size += kAlignment - 1; -- GitLab