From cab108ddd81d5ef7a89ae935e4a383dcec84faa1 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Fri, 16 Aug 2024 04:45:10 +0000 Subject: [PATCH 1/3] Multithread resize --- adapters/opencv/kleidicv_hal.cpp | 14 +- .../include/kleidicv/resize/resize_linear.h | 75 ++- kleidicv/src/resize/resize_linear_api.cpp | 41 +- kleidicv/src/resize/resize_linear_neon.cpp | 207 ++++--- kleidicv/src/resize/resize_linear_sc.h | 578 +++++++++--------- kleidicv/src/resize/resize_linear_sme2.cpp | 24 +- kleidicv/src/resize/resize_linear_sve2.cpp | 27 +- .../include/kleidicv_thread/kleidicv_thread.h | 80 +++ kleidicv_thread/src/kleidicv_thread.cpp | 53 ++ scripts/ci.sh | 2 +- test/api/test_resize_linear.cpp | 6 + test/api/test_thread_resize.cpp | 112 ++++ 12 files changed, 769 insertions(+), 450 deletions(-) create mode 100644 test/api/test_thread_resize.cpp diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 323ca67f7..8c169eae3 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -755,9 +755,9 @@ int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, inv_scale_y == 0.5 && (interpolation == CV_HAL_INTER_LINEAR || interpolation == CV_HAL_INTER_AREA)) { - return convert_error(kleidicv_resize_to_quarter_u8( + return convert_error(kleidicv_thread_resize_to_quarter_u8( src_data, src_step, src_width, src_height, dst_data, dst_step, - dst_width, dst_height)); + dst_width, dst_height, get_multithreading())); } if (interpolation != CV_HAL_INTER_LINEAR) { @@ -773,14 +773,14 @@ int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, switch (CV_MAT_DEPTH(src_type)) { case CV_8U: - return convert_error( - kleidicv_resize_linear_u8(src_data, src_step, src_width, src_height, - dst_data, dst_step, dst_width, dst_height)); + return convert_error(kleidicv_thread_resize_linear_u8( + src_data, src_step, src_width, src_height, dst_data, dst_step, + dst_width, dst_height, get_multithreading())); case CV_32F: - return convert_error(kleidicv_resize_linear_f32( + return convert_error(kleidicv_thread_resize_linear_f32( reinterpret_cast(src_data), src_step, src_width, src_height, reinterpret_cast(dst_data), dst_step, dst_width, - dst_height)); + dst_height, get_multithreading())); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } diff --git a/kleidicv/include/kleidicv/resize/resize_linear.h b/kleidicv/include/kleidicv/resize/resize_linear.h index 55aa03400..d31db4d25 100644 --- a/kleidicv/include/kleidicv/resize/resize_linear.h +++ b/kleidicv/include/kleidicv/resize/resize_linear.h @@ -10,38 +10,65 @@ namespace kleidicv { namespace neon { -kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); -kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, - size_t src_width, size_t src_height, - float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); } // namespace neon namespace sve2 { -kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); -kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, - size_t src_width, size_t src_height, - float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); } // namespace sve2 namespace sme2 { -kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); -kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, - size_t src_width, size_t src_height, - float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); +kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); } // namespace sme2 } // namespace kleidicv +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus +/// Internal - not part of the public API and its direct use is not supported. +/// It is used by the multithreaded function. +extern kleidicv_error_t (*kleidicv_resize_linear_stripe_u8)( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); + +/// Internal - not part of the public API and its direct use is not supported. +/// It is used by the multithreaded function. +extern kleidicv_error_t (*kleidicv_resize_linear_stripe_f32)( + const float *src, size_t src_stride, size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + #endif // KLEIDICV_RESIZE_RESIZE_H diff --git a/kleidicv/src/resize/resize_linear_api.cpp b/kleidicv/src/resize/resize_linear_api.cpp index a838caba8..97c71ff18 100644 --- a/kleidicv/src/resize/resize_linear_api.cpp +++ b/kleidicv/src/resize/resize_linear_api.cpp @@ -7,11 +7,40 @@ #include "kleidicv/resize/resize_linear.h" KLEIDICV_MULTIVERSION_C_API( - kleidicv_resize_linear_u8, &kleidicv::neon::resize_linear_u8, - KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::resize_linear_u8), - &kleidicv::sme2::resize_linear_u8); + kleidicv_resize_linear_stripe_u8, &kleidicv::neon::resize_linear_stripe_u8, + KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::resize_linear_stripe_u8), + &kleidicv::sme2::resize_linear_stripe_u8); KLEIDICV_MULTIVERSION_C_API( - kleidicv_resize_linear_f32, &kleidicv::neon::resize_linear_f32, - KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::resize_linear_f32), - &kleidicv::sme2::resize_linear_f32); + kleidicv_resize_linear_stripe_f32, + &kleidicv::neon::resize_linear_stripe_f32, + KLEIDICV_SVE2_IMPL_IF(&kleidicv::sve2::resize_linear_stripe_f32), + &kleidicv::sme2::resize_linear_stripe_f32); + +namespace kleidicv { + +static kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { + return kleidicv_resize_linear_stripe_u8(src, src_stride, src_width, + src_height, 0, src_height, dst, + dst_stride, dst_width, dst_height); +} + +static kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, + size_t src_width, size_t src_height, + float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { + return kleidicv_resize_linear_stripe_f32(src, src_stride, src_width, + src_height, 0, src_height, dst, + dst_stride, dst_width, dst_height); +} + +} // namespace kleidicv + +KLEIDICV_MULTIVERSION_C_API(kleidicv_resize_linear_u8, + &kleidicv::resize_linear_u8, nullptr, nullptr); + +KLEIDICV_MULTIVERSION_C_API(kleidicv_resize_linear_f32, + &kleidicv::resize_linear_f32, nullptr, nullptr); diff --git a/kleidicv/src/resize/resize_linear_neon.cpp b/kleidicv/src/resize/resize_linear_neon.cpp index 5f0d01852..e3a652870 100644 --- a/kleidicv/src/resize/resize_linear_neon.cpp +++ b/kleidicv/src/resize/resize_linear_neon.cpp @@ -59,7 +59,7 @@ uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c, KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride) { + size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { size_t dst_width = src_width * 2; auto lerp1d_scalar = [](uint8_t near, uint8_t far) { @@ -167,10 +167,12 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8( }; // Top row - process_edge_row(src, dst); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 2 + 1; const uint8_t *src_row0 = src + src_stride * src_y; const uint8_t *src_row1 = src_row0 + src_stride; @@ -181,15 +183,17 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8( } // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (src_height * 2 - 1)); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (src_height * 2 - 1)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride) { + size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride) { size_t dst_width = src_width * 4, dst_height = src_height * 4; auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, @@ -383,11 +387,13 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8( }; // Top rows - process_edge_row(src, dst); - memcpy(dst + dst_stride, dst, dst_stride); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + memcpy(dst + dst_stride, dst, dst_stride); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 4 + 2; const uint8_t *src_row0 = src + src_stride * src_y; const uint8_t *src_row1 = src_row0 + src_stride; @@ -400,19 +406,22 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8( } // Bottom rows - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 2)); - memcpy(dst + dst_stride * (dst_height - 1), - dst + dst_stride * (dst_height - 2), dst_stride); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + memcpy(dst + dst_stride * (dst_height - 1), + dst + dst_stride * (dst_height - 2), dst_stride); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { +kleidicv_error_t resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); @@ -421,19 +430,19 @@ kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, return KLEIDICV_OK; } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { - return resize_2x2_u8(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_2x2_u8(src, src_stride, src_width, src_height, y_begin, y_end, + dst, dst_stride); } if (src_width * 4 == dst_width && src_height * 4 == dst_height) { - return resize_4x4_u8(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_4x4_u8(src, src_stride, src_width, src_height, y_begin, y_end, + dst, dst_stride); } return KLEIDICV_ERROR_NOT_IMPLEMENTED; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) { + size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { size_t dst_width = src_width * 2; src_stride /= sizeof(float); dst_stride /= sizeof(float); @@ -528,10 +537,12 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( }; // Top row - process_edge_row(src, dst); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 2 + 1; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -542,15 +553,17 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32( } // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (src_height * 2 - 1)); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (src_height * 2 - 1)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) { + size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { using T = float; size_t dst_height = src_height * 4; size_t dst_width = src_width * 4; @@ -626,7 +639,6 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r); // Middle elements - size_t src_x = 0; for (; src_x + 4 < src_width; src_x += 4) { size_t dst_x = src_x * 4 + 2; @@ -709,11 +721,13 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( }; // Top rows - process_edge_row(src, dst); - memcpy(dst + dst_stride, dst, dst_stride * sizeof(T)); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + memcpy(dst + dst_stride, dst, dst_stride * sizeof(T)); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 4 + 2; const T *src_row0 = src + src_stride * src_y; const T *src_row1 = src_row0 + src_stride; @@ -726,17 +740,19 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( } // Bottom rows - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 2)); - memcpy(dst + dst_stride * (dst_height - 1), - dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T)); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + memcpy(dst + dst_stride * (dst_height - 1), + dst + dst_stride * (dst_height - 2), dst_stride * sizeof(T)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) { + size_t y_begin, size_t y_end, float *dst, size_t dst_stride) { size_t dst_width = src_width * 8; size_t dst_height = src_height * 8; src_stride /= sizeof(float); @@ -762,8 +778,32 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( // Handle top or bottom edge auto process_edge_row = - [src_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, + [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) { + // Left elements + dst_row[3] = dst_row[2] = dst_row[1] = dst_row[0] = src_row[0]; + dst_row[dst_stride + 3] = dst_row[dst_stride + 2] = + dst_row[dst_stride + 1] = dst_row[dst_stride] = src_row[0]; + dst_row[2 * dst_stride + 3] = dst_row[2 * dst_stride + 2] = + dst_row[2 * dst_stride + 1] = dst_row[2 * dst_stride] = src_row[0]; + dst_row[3 * dst_stride + 3] = dst_row[3 * dst_stride + 2] = + dst_row[3 * dst_stride + 1] = dst_row[3 * dst_stride] = src_row[0]; + + // Right elements + float *dst_right = dst_row + dst_width - 4; + dst_right[3] = dst_right[2] = dst_right[1] = dst_right[0] = + src_row[src_width - 1]; + dst_right[dst_stride + 3] = dst_right[dst_stride + 2] = + dst_right[dst_stride + 1] = dst_right[dst_stride] = + src_row[src_width - 1]; + dst_right[2 * dst_stride + 3] = dst_right[2 * dst_stride + 2] = + dst_right[2 * dst_stride + 1] = dst_right[2 * dst_stride] = + src_row[src_width - 1]; + dst_right[3 * dst_stride + 3] = dst_right[3 * dst_stride + 2] = + dst_right[3 * dst_stride + 1] = dst_right[3 * dst_stride] = + src_row[src_width - 1]; + + // Middle elements float32x4_t a, b = vdupq_n_f32(src_row[0]); for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { a = b; @@ -800,11 +840,27 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s); }; - auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, &coeffs_p0, - &coeffs_q0, &coeffs_r0, &coeffs_s0, &coeffs_p1, - &coeffs_q1, &coeffs_r1, + auto process_row = [src_width, dst_width, lerp2d_vector, lerp1d_vector_n, + &coeffs_p0, &coeffs_q0, &coeffs_r0, &coeffs_s0, + &coeffs_p1, &coeffs_q1, &coeffs_r1, &coeffs_s1](const float *src_row0, const float *src_row1, float *dst_row0, size_t dst_stride) { + // Left & right elements + const float s0l = src_row0[0], s1l = src_row1[0]; + const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + float *dst_row = dst_row0; + for (size_t i = 0; i < 8; ++i) { + vst1q(dst_row, lerp1d_vector_n(static_cast(15 - i * 2) / 16.0F, + vdupq_n_f32(s0l), + static_cast(i * 2 + 1) / 16.0F, + vdupq_n_f32(s1l))); + vst1q(dst_row + dst_width - 4, + lerp1d_vector_n( + static_cast(15 - i * 2) / 16.0F, vdupq_n_f32(s0r), + static_cast(i * 2 + 1) / 16.0F, vdupq_n_f32(s1r))); + dst_row += dst_stride; + } + // Middle elements dst_row0 += 4; float *dst_row1 = dst_row0 + dst_stride; @@ -864,48 +920,13 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( } }; - // Corners - auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, - float value) { - float *row = dst + dst_stride * top_row + left_column; - for (size_t i = 0; i < 4; ++i) { - for (size_t j = 0; j < 4; ++j) { - row[j] = value; - } - row += dst_stride; - } - }; - set_corner(0, 0, src[0]); - set_corner(dst_width - 4, 0, src[src_width - 1]); - set_corner(0, dst_height - 4, src[src_stride * (src_height - 1)]); - set_corner(dst_width - 4, dst_height - 4, - src[src_stride * (src_height - 1) + src_width - 1]); - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - float *dst_row = dst + dst_stride * (src_y * 8 + 4); - const float *src_row0 = src + src_stride * src_y; - const float *src_row1 = src_row0 + src_stride; - const float s0l = src_row0[0], s1l = src_row1[0]; - const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; - for (size_t i = 0; i < 8; ++i) { - vst1q(dst_row, lerp1d_vector_n(static_cast(15 - i * 2) / 16.0F, - vdupq_n_f32(s0l), - static_cast(i * 2 + 1) / 16.0F, - vdupq_n_f32(s1l))); - vst1q(dst_row + dst_width - 4, - lerp1d_vector_n( - static_cast(15 - i * 2) / 16.0F, vdupq_n_f32(s0r), - static_cast(i * 2 + 1) / 16.0F, vdupq_n_f32(s1r))); - dst_row += dst_stride; - } - } - // Top rows - process_edge_row(src, dst, dst_stride); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst, dst_stride); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 8 + 4; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -913,17 +934,19 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( } // Bottom rows - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 4), dst_stride); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 4), dst_stride); + } return KLEIDICV_OK; } -KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, - size_t src_width, size_t src_height, - float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { +kleidicv_error_t resize_linear_stripe_f32(const float *src, size_t src_stride, + size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, + float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); @@ -932,16 +955,16 @@ kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, return KLEIDICV_OK; } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { - return resize_2x2_f32(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_2x2_f32(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } if (src_width * 4 == dst_width && src_height * 4 == dst_height) { - return resize_4x4_f32(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_4x4_f32(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } if (src_width * 8 == dst_width && src_height * 8 == dst_height) { - return resize_8x8_f32(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_8x8_f32(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } return KLEIDICV_ERROR_NOT_IMPLEMENTED; } diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index c43b1e497..920f64da6 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -14,7 +14,8 @@ namespace KLEIDICV_TARGET_NAMESPACE { KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, uint8_t *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 2; size_t dst_height = src_height * 2; @@ -76,8 +77,14 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( #define KSC KLEIDICV_STREAMING_COMPATIBLE // Handle top or bottom edge - auto process_edge_row = [src_width, lerp1d_vector](const uint8_t *src_row, - uint8_t *dst_row) KSC { + auto process_edge_row = [src_width, dst_width, lerp1d_vector]( + const uint8_t *src_row, uint8_t *dst_row) KSC { + // Left element + dst_row[0] = src_row[0]; + + // Right element + dst_row[dst_width - 1] = src_row[src_width - 1]; + for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { size_t dst_x = src_x * 2 + 1; @@ -93,10 +100,20 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( } }; - auto process_row = [src_width, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, uint8_t *dst_row1) KLEIDICV_STREAMING_COMPATIBLE { + // Left edge + dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); + dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); + + // Right edge + dst_row0[dst_width - 1] = + lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); + dst_row1[dst_width - 1] = + lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { size_t dst_x = src_x * 2 + 1; @@ -118,37 +135,13 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( } }; - // Corners - dst[0] = src[0]; - dst[dst_width - 1] = src[src_width - 1]; - dst[dst_stride * (dst_height - 1)] = src[src_stride * (src_height - 1)]; - dst[dst_stride * (dst_height - 1) + dst_width - 1] = - src[src_stride * (src_height - 1) + src_width - 1]; - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - size_t dst_y = src_y * 2 + 1; - const uint8_t *src_row0 = src + src_stride * src_y; - const uint8_t *src_row1 = src_row0 + src_stride; - uint8_t *dst_row0 = dst + dst_stride * dst_y; - uint8_t *dst_row1 = dst_row0 + dst_stride; - - // Left edge - dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); - dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); - - // Right edge - dst_row0[dst_width - 1] = - lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); - dst_row1[dst_width - 1] = - lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); - } - // Top row - process_edge_row(src, dst); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 2 + 1; const uint8_t *src_row0 = src + src_stride * src_y; const uint8_t *src_row1 = src_row0 + src_stride; @@ -159,15 +152,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( } // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 1)); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 1)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, uint8_t *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 4; size_t dst_height = src_height * 4; @@ -220,8 +216,15 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( }; // Handle top or bottom edge - auto process_edge_row = [src_width, lerp1d_vector](const uint8_t *src_row, - uint8_t *dst_row) KSC { + auto process_edge_row = [src_width, dst_width, lerp1d_vector]( + const uint8_t *src_row, uint8_t *dst_row) KSC { + // Left elements + dst_row[1] = dst_row[0] = src_row[0]; + + // Right elements + dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; + + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { size_t dst_x = src_x * 4 + 2; svbool_t pg = svwhilelt_b8(src_x + 1, src_width); @@ -233,11 +236,27 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( } }; - auto process_row = [src_width, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, uint8_t *dst_row1, uint8_t *dst_row2, uint8_t *dst_row3) KLEIDICV_STREAMING_COMPATIBLE { + // Left elements + const uint8_t s0l = src_row0[0], s1l = src_row1[0]; + dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); + dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); + dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); + dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); + + // Right elements + const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + const size_t dr0 = dst_width - 2; + const size_t dr1 = dst_width - 1; + dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); + dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); + dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); + dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { size_t dst_x = src_x * 4 + 2; @@ -273,47 +292,6 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( } }; - // Corners - auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, - uint8_t value) KSC { - dst[dst_stride * top_row + left_column] = value; - dst[dst_stride * top_row + left_column + 1] = value; - dst[dst_stride * (top_row + 1) + left_column] = value; - dst[dst_stride * (top_row + 1) + left_column + 1] = value; - }; - set_corner(0, 0, src[0]); - set_corner(dst_width - 2, 0, src[src_width - 1]); - set_corner(0, dst_height - 2, src[src_stride * (src_height - 1)]); - set_corner(dst_width - 2, dst_height - 2, - src[src_stride * (src_height - 1) + src_width - 1]); - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - size_t dst_y = src_y * 4 + 2; - const uint8_t *src_row0 = src + src_stride * src_y; - const uint8_t *src_row1 = src_row0 + src_stride; - uint8_t *dst_row0 = dst + dst_stride * dst_y; - uint8_t *dst_row1 = dst_row0 + dst_stride; - uint8_t *dst_row2 = dst_row1 + dst_stride; - uint8_t *dst_row3 = dst_row2 + dst_stride; - - // Left elements - const uint8_t s0l = src_row0[0], s1l = src_row1[0]; - dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); - dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); - dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); - dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); - - // Right elements - const uint8_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; - const size_t dr0 = dst_width - 2; - const size_t dr1 = dst_width - 1; - dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); - dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); - dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); - dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); - } - auto copy_dst_row = [src_width](const uint8_t *dst_from, uint8_t *dst_to) KSC { for (size_t i = 0; i < src_width; i += svcntb()) { @@ -322,12 +300,14 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( } }; - // Top row - process_edge_row(src, dst); - copy_dst_row(dst, dst + dst_stride); + // Top rows + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + copy_dst_row(dst, dst + dst_stride); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 4 + 2; const uint8_t *src_row0 = src + src_stride * src_y; const uint8_t *src_row1 = src_row0 + src_stride; @@ -339,20 +319,22 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); } - // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 2)); - copy_dst_row(dst + dst_stride * (dst_height - 2), - dst + dst_stride * (dst_height - 1)); + // Bottom rows + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + copy_dst_row(dst + dst_stride * (dst_height - 2), + dst + dst_stride * (dst_height - 1)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, float *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 2; - size_t dst_height = src_height * 2; src_stride /= sizeof(float); dst_stride /= sizeof(float); @@ -381,8 +363,12 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( #define KSC KLEIDICV_STREAMING_COMPATIBLE // Handle top or bottom edge - auto process_edge_row = [src_width, lerp1d_vector](const float *src_row, - float *dst_row) KSC { + auto process_edge_row = [src_width, dst_width, lerp1d_vector]( + const float *src_row, float *dst_row) KSC { + // Left element + dst_row[0] = src_row[0]; + + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) { size_t dst_x = src_x * 2 + 1; @@ -394,11 +380,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( svst2_f32(pg, dst_row + dst_x, svcreate2(lerp1d_vector(pg, a, b), lerp1d_vector(pg, b, a))); } + + // Right element + dst_row[dst_width - 1] = src_row[src_width - 1]; }; - auto process_row = [src_width, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( const float *src_row0, const float *src_row1, float *dst_row0, float *dst_row1) KSC { + // Left element + dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); + dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) { size_t dst_x = src_x * 2 + 1; @@ -417,39 +410,20 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( svcreate2(lerp2d_vector(pg, c, a, d, b), lerp2d_vector(pg, d, b, c, a))); } - }; - // Corners - dst[0] = src[0]; - dst[dst_width - 1] = src[src_width - 1]; - dst[dst_stride * (dst_height - 1)] = src[src_stride * (src_height - 1)]; - dst[dst_stride * (dst_height - 1) + dst_width - 1] = - src[src_stride * (src_height - 1) + src_width - 1]; - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - size_t dst_y = src_y * 2 + 1; - const float *src_row0 = src + src_stride * src_y; - const float *src_row1 = src_row0 + src_stride; - float *dst_row0 = dst + dst_stride * dst_y; - float *dst_row1 = dst_row0 + dst_stride; - - // Left edge - dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); - dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); - - // Right edge + // Right element dst_row0[dst_width - 1] = lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); dst_row1[dst_width - 1] = lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); - } + }; // Top row - process_edge_row(src, dst); - + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 2 + 1; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -460,24 +434,23 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( } // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 1)); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (src_height * 2 - 1)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, float *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 4; size_t dst_height = src_height * 4; src_stride /= sizeof(float); dst_stride /= sizeof(float); - auto lerp1d_scalar = - [](float p, float a, float q, float b) - KLEIDICV_STREAMING_COMPATIBLE { return p * a + q * b; }; - auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q, svfloat32_t b) KSC { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q); }; @@ -493,25 +466,46 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( }; // Handle top or bottom edge - auto process_edge_row = [src_width, lerp1d_vector](const float *src_row, - float *dst_row) KSC { + auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector]( + const float *src_row, float *dst_row) KSC { + // Left elements + dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] = + src_row[0]; + + // Right elements + dst_row[dst_width - 1] = dst_row[dst_width - 2] = + dst_row[dst_stride + dst_width - 1] = + dst_row[dst_stride + dst_width - 2] = src_row[src_width - 1]; + + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) { size_t dst_x = src_x * 4 + 2; svbool_t pg = svwhilelt_b32(src_x + 1, src_width); svfloat32_t a = svld1_f32(pg, src_row + src_x); svfloat32_t b = svld1_f32(pg, src_row + src_x + 1); - svst4_f32(pg, dst_row + dst_x, - svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b), - lerp1d_vector(pg, 0.625F, a, 0.375F, b), - lerp1d_vector(pg, 0.375F, a, 0.625F, b), - lerp1d_vector(pg, 0.125F, a, 0.875F, b))); + svfloat32x4_t result = svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b), + lerp1d_vector(pg, 0.625F, a, 0.375F, b), + lerp1d_vector(pg, 0.375F, a, 0.625F, b), + lerp1d_vector(pg, 0.125F, a, 0.875F, b)); + svst4_f32(pg, dst_row + dst_x, result); + svst4_f32(pg, dst_row + dst_stride + dst_x, result); } }; - auto process_row = [src_width, lerp1d_vector, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const float *src_row0, const float *src_row1, float *dst_row0, float *dst_row1, float *dst_row2, - float *dst_row3) KLEIDICV_STREAMING_COMPATIBLE { + float *dst_row3) KSC { + // Left elements + svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element + svbool_t pg2 = svptrue_pat_b32(SV_VL2); // write 2 elements + svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0); + svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0); + svst1(pg2, dst_row0, lerp1d_vector(pg2, 0.875F, s0l, 0.125F, s1l)); + svst1(pg2, dst_row1, lerp1d_vector(pg2, 0.625F, s0l, 0.375F, s1l)); + svst1(pg2, dst_row2, lerp1d_vector(pg2, 0.375F, s0l, 0.625F, s1l)); + svst1(pg2, dst_row3, lerp1d_vector(pg2, 0.125F, s0l, 0.875F, s1l)); + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) { size_t dst_x = src_x * 4 + 2; @@ -564,62 +558,27 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( svget4(dst_d, 3)))); svst4_f32(pg, dst_row3 + dst_x, dst_d); } - }; - - // Corners - auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, - float value) KSC { - dst[dst_stride * top_row + left_column] = value; - dst[dst_stride * top_row + left_column + 1] = value; - dst[dst_stride * (top_row + 1) + left_column] = value; - dst[dst_stride * (top_row + 1) + left_column + 1] = value; - }; - set_corner(0, 0, src[0]); - set_corner(dst_width - 2, 0, src[src_width - 1]); - set_corner(0, dst_height - 2, src[src_stride * (src_height - 1)]); - set_corner(dst_width - 2, dst_height - 2, - src[src_stride * (src_height - 1) + src_width - 1]); - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - size_t dst_y = src_y * 4 + 2; - const float *src_row0 = src + src_stride * src_y; - const float *src_row1 = src_row0 + src_stride; - float *dst_row0 = dst + dst_stride * dst_y; - float *dst_row1 = dst_row0 + dst_stride; - float *dst_row2 = dst_row1 + dst_stride; - float *dst_row3 = dst_row2 + dst_stride; - - // Left elements - const float s0l = src_row0[0], s1l = src_row1[0]; - dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l); - dst_row1[0] = dst_row1[1] = lerp1d_scalar(0.625F, s0l, 0.375F, s1l); - dst_row2[0] = dst_row2[1] = lerp1d_scalar(0.375F, s0l, 0.625F, s1l); - dst_row3[0] = dst_row3[1] = lerp1d_scalar(0.125F, s0l, 0.875F, s1l); // Right elements - const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; - const size_t dr0 = dst_width - 2; - const size_t dr1 = dst_width - 1; - dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(0.875F, s0r, 0.125F, s1r); - dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(0.625F, s0r, 0.375F, s1r); - dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(0.375F, s0r, 0.625F, s1r); - dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(0.125F, s0r, 0.875F, s1r); - } - - auto copy_dst_row = [src_width](const float *dst_from, float *dst_to) KSC { - for (size_t i = 0; i < src_width; i += svcntw()) { - svbool_t pg = svwhilelt_b32(i, src_width); - svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4)); - } + svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0); + svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0); + svst1(pg2, dst_row0 + dst_width - 2, + lerp1d_vector(pg2, 0.875F, s0r, 0.125F, s1r)); + svst1(pg2, dst_row1 + dst_width - 2, + lerp1d_vector(pg2, 0.625F, s0r, 0.375F, s1r)); + svst1(pg2, dst_row2 + dst_width - 2, + lerp1d_vector(pg2, 0.375F, s0r, 0.625F, s1r)); + svst1(pg2, dst_row3 + dst_width - 2, + lerp1d_vector(pg2, 0.125F, s0r, 0.875F, s1r)); }; - // Top row - process_edge_row(src, dst); - copy_dst_row(dst, dst + dst_stride); + // Top rows + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 4 + 2; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -631,18 +590,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); } - // Bottom row - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 2)); - copy_dst_row(dst + dst_stride * (dst_height - 2), - dst + dst_stride * (dst_height - 1)); - + // Bottom rows + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, float *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 8; size_t dst_height = src_height * 8; src_stride /= sizeof(float); @@ -669,8 +628,20 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( // Handle top or bottom edge auto process_edge_row = - [src_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, + [src_width, dst_width, lerp1d_vector, &coeffs_a0, &coeffs_a1, &coeffs_b0, &coeffs_b1](const float *src_row, float *dst_row, size_t dst_stride) { + // Left elements + float left = src_row[0]; + float *dst = dst_row; + for (size_t i = 0; i < 4; ++i) { + *dst++ = left; + *dst++ = left; + *dst++ = left; + *dst = left; + dst += dst_stride - 3; + } + + // Middle elements svfloat32_t a, b = svdup_n_f32(src_row[0]); for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { a = b; @@ -691,6 +662,17 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( svst1(svptrue_b32(), dst_row2 + 4, dst); svst1(svptrue_b32(), dst_row3 + 4, dst); } + + // Right elements + dst = dst_row + dst_width - 4; + float right = src_row[src_width - 1]; + for (size_t i = 0; i < 4; ++i) { + *dst++ = right; + *dst++ = right; + *dst++ = right; + *dst = right; + dst += dst_stride - 3; + } }; svfloat32_t coeffs_p0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 15.0 / 16); @@ -716,6 +698,19 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( const float *src_row0, const float *src_row1, float *dst_row0, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + // Left elements + svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element + svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements + float *dst_lr = dst_row0; + svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0); + svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0); + for (size_t i = 0; i < 8; ++i) { + svst1(pg4, dst_lr, + lerp1d_vector_n(pg4, static_cast(15 - i * 2) / 16.0F, s0l, + static_cast(i * 2 + 1) / 16.0F, s1l)); + dst_lr += dst_stride; + } + // Middle elements dst_row0 += 4; float *dst_row1 = dst_row0 + dst_stride; @@ -785,52 +780,26 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( dst_row6 += 4; dst_row7 += 4; } - }; - // Corners - auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, - float value) { - float *row = dst + dst_stride * top_row + left_column; - for (size_t i = 0; i < 4; ++i) { - for (size_t j = 0; j < 4; ++j) { - row[j] = value; - } - row += dst_stride; - } - }; - set_corner(0, 0, src[0]); - set_corner(dst_width - 4, 0, src[src_width - 1]); - set_corner(0, dst_height - 4, src[src_stride * (src_height - 1)]); - set_corner(dst_width - 4, dst_height - 4, - src[src_stride * (src_height - 1) + src_width - 1]); - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - float *dst_row = dst + dst_stride * (src_y * 8 + 4); - const float *src_row0 = src + src_stride * src_y; - const float *src_row1 = src_row0 + src_stride; - const float s0l = src_row0[0], s1l = src_row1[0]; - const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + // Right elements + dst_lr = dst_row0; + svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0); + svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0); for (size_t i = 0; i < 8; ++i) { - svst1( - svptrue_b32(), dst_row, - lerp1d_vector_n(svptrue_b32(), static_cast(15 - i * 2) / 16.0F, - svdup_f32(s0l), static_cast(i * 2 + 1) / 16.0F, - svdup_f32(s1l))); - svst1( - svptrue_b32(), dst_row + dst_width - 4, - lerp1d_vector_n(svptrue_b32(), static_cast(15 - i * 2) / 16.0F, - svdup_f32(s0r), static_cast(i * 2 + 1) / 16.0F, - svdup_f32(s1r))); - dst_row += dst_stride; + svst1(pg4, dst_lr, + lerp1d_vector_n(pg4, static_cast(15 - i * 2) / 16.0F, s0r, + static_cast(i * 2 + 1) / 16.0F, s1r)); + dst_lr += dst_stride; } - } + }; // Top rows - process_edge_row(src, dst, dst_stride); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst, dst_stride); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 8 + 4; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -838,15 +807,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( } // Bottom rows - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 4), dst_stride); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 4), dst_stride); + } return KLEIDICV_OK; } KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, float *dst, + size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { size_t dst_width = src_width * 8; size_t dst_height = src_height * 8; src_stride /= sizeof(float); @@ -899,11 +871,23 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( }; // Handle top or bottom edge - auto process_edge_row = [src_width, index_and_lerp1d, &indices_0a, + auto process_edge_row = [src_width, dst_width, index_and_lerp1d, &indices_0a, &indices_0b, &indices_1a, &indices_1b, &indices_2a, &indices_2b, &indices_3a, &indices_3b](const float *src_row, float *dst_row, size_t dst_stride) KSC { + // Left elements + float left = src_row[0]; + float *dst = dst_row; + for (size_t i = 0; i < 4; ++i) { + *dst++ = left; + *dst++ = left; + *dst++ = left; + *dst = left; + dst += dst_stride - 3; + } + + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) { svbool_t pg = svwhilelt_b32(src_x, src_width); svfloat32_t svsrc = svld1_f32(pg, src_row + src_x); @@ -942,6 +926,17 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( svst1_vnum(pg_4, dst_row2, 3, dst); svst1_vnum(pg_4, dst_row3, 3, dst); } + + // Right elements + dst = dst_row + dst_width - 4; + float right = src_row[src_width - 1]; + for (size_t i = 0; i < 4; ++i) { + *dst++ = right; + *dst++ = right; + *dst++ = right; + *dst = right; + dst += dst_stride - 3; + } }; svfloat32_t coeffs_p = svmul_n_f32_x(svptrue_b32(), coeffs_a, 15.0 / 16); @@ -963,12 +958,25 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( svtbl(src1, indices_b), coeffs_s); }; - auto process_row = [src_width, index_and_lerp2d, lerp1d_vector, &indices_0a, - &indices_0b, &indices_1a, &indices_1b, &indices_2a, - &indices_2b, &indices_3a, &indices_3b]( + auto process_row = [src_width, dst_width, index_and_lerp2d, lerp1d_vector, + &indices_0a, &indices_0b, &indices_1a, &indices_1b, + &indices_2a, &indices_2b, &indices_3a, &indices_3b]( const float *src_row0, const float *src_row1, float *dst_row, size_t dst_stride) KLEIDICV_STREAMING_COMPATIBLE { + // Left edge + svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read 1 element + svbool_t pg4 = svptrue_pat_b32(SV_VL4); // write 4 elements + float *dst_lr = dst_row; + svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0); + svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0); + for (size_t i = 0; i < 8; ++i) { + svst1(pg4, dst_lr, + lerp1d_vector(pg4, static_cast(15 - i * 2) / 16.0F, s0l, + static_cast(i * 2 + 1) / 16.0F, s1l)); + dst_lr += dst_stride; + } + // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) { size_t dst_x = src_x * 8 + 4; @@ -1062,53 +1070,26 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( svst1_vnum(pg_4, dst_row6, 3, lerp1d_vector(pg_4, 1.0 / 7, dst_0, 6.0 / 7, dst_7)); } - }; - // Corners - auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, - float value) KSC { - float *row = dst + dst_stride * top_row + left_column; - for (size_t i = 0; i < 4; ++i) { - for (size_t j = 0; j < 4; ++j) { - row[j] = value; - } - row += dst_stride; - } - }; - set_corner(0, 0, src[0]); - set_corner(dst_width - 4, 0, src[src_width - 1]); - set_corner(0, dst_height - 4, src[src_stride * (src_height - 1)]); - set_corner(dst_width - 4, dst_height - 4, - src[src_stride * (src_height - 1) + src_width - 1]); - - // Left & right edge - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { - float *dst_row = dst + dst_stride * (src_y * 8 + 4); - const float *src_row0 = src + src_stride * src_y; - const float *src_row1 = src_row0 + src_stride; - const float s0l = src_row0[0], s1l = src_row1[0]; - const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; - svbool_t pg = svptrue_pat_b32(SV_VL4); // write 4 elements + // Right edge + dst_lr = dst_row; + svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0); + svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0); for (size_t i = 0; i < 8; ++i) { - svst1(pg, dst_row, - lerp1d_vector(pg, static_cast(15 - i * 2) / 16.0F, - svdup_f32_x(pg, s0l), - static_cast(i * 2 + 1) / 16.0F, - svdup_f32_x(pg, s1l))); - svst1(pg, dst_row + dst_width - 4, - lerp1d_vector(pg, static_cast(15 - i * 2) / 16.0F, - svdup_f32_x(pg, s0r), - static_cast(i * 2 + 1) / 16.0F, - svdup_f32_x(pg, s1r))); - dst_row += dst_stride; + svst1(pg4, dst_lr + dst_width - 4, + lerp1d_vector(pg4, static_cast(15 - i * 2) / 16.0F, s0r, + static_cast(i * 2 + 1) / 16.0F, s1r)); + dst_lr += dst_stride; } - } + }; // Top rows - process_edge_row(src, dst, dst_stride); + if (KLEIDICV_LIKELY(y_begin == 0)) { + process_edge_row(src, dst, dst_stride); + } // Middle rows - for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) { size_t dst_y = src_y * 8 + 4; const float *src_row0 = src + src_stride * src_y; const float *src_row1 = src_row0 + src_stride; @@ -1116,16 +1097,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc( } // Bottom rows - process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (dst_height - 4), dst_stride); + if (KLEIDICV_LIKELY(y_end == src_height)) { + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 4), dst_stride); + } return KLEIDICV_OK; } -KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_u8_sc( +KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, size_t dst_width, - size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); @@ -1134,20 +1117,20 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_u8_sc( return KLEIDICV_OK; } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { - return resize_2x2_u8_sc(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_2x2_u8_sc(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } if (src_width * 4 == dst_width && src_height * 4 == dst_height) { - return resize_4x4_u8_sc(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_4x4_u8_sc(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } return KLEIDICV_ERROR_NOT_IMPLEMENTED; } -KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_f32_sc( +KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_f32_sc( const float *src, size_t src_stride, size_t src_width, size_t src_height, - float *dst, size_t dst_stride, size_t dst_width, - size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { + size_t y_begin, size_t y_end, float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) KLEIDICV_STREAMING_COMPATIBLE { CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); CHECK_IMAGE_SIZE(dst_width, dst_height); @@ -1156,20 +1139,21 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_f32_sc( return KLEIDICV_OK; } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { - return resize_2x2_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_2x2_f32_sc(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } if (src_width * 4 == dst_width && src_height * 4 == dst_height) { - return resize_4x4_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_4x4_f32_sc(src, src_stride, src_width, src_height, y_begin, + y_end, dst, dst_stride); } if (src_width * 8 == dst_width && src_height * 8 == dst_height) { if (svcntw() >= 8) { return resize_8x8_f32_sve256plus_sc(src, src_stride, src_width, - src_height, dst, dst_stride); + src_height, y_begin, y_end, dst, + dst_stride); } - return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height, dst, - dst_stride); + return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height, + y_begin, y_end, dst, dst_stride); } return KLEIDICV_ERROR_NOT_IMPLEMENTED; } diff --git a/kleidicv/src/resize/resize_linear_sme2.cpp b/kleidicv/src/resize/resize_linear_sme2.cpp index 2d326b71b..a308ea2cb 100644 --- a/kleidicv/src/resize/resize_linear_sme2.cpp +++ b/kleidicv/src/resize/resize_linear_sme2.cpp @@ -7,19 +7,23 @@ namespace kleidicv::sme2 { KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t -resize_linear_u8(const uint8_t *src, size_t src_stride, size_t src_width, - size_t src_height, uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { - return resize_linear_u8_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height); +resize_linear_stripe_u8(const uint8_t *src, size_t src_stride, size_t src_width, + size_t src_height, size_t y_begin, size_t y_end, + uint8_t *dst, size_t dst_stride, size_t dst_width, + size_t dst_height) { + return resize_linear_stripe_u8_sc(src, src_stride, src_width, src_height, + y_begin, y_end, dst, dst_stride, dst_width, + dst_height); } KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t -resize_linear_f32(const float *src, size_t src_stride, size_t src_width, - size_t src_height, float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { - return resize_linear_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height); +resize_linear_stripe_f32(const float *src, size_t src_stride, size_t src_width, + size_t src_height, size_t y_begin, size_t y_end, + float *dst, size_t dst_stride, size_t dst_width, + size_t dst_height) { + return resize_linear_stripe_f32_sc(src, src_stride, src_width, src_height, + y_begin, y_end, dst, dst_stride, dst_width, + dst_height); } } // namespace kleidicv::sme2 diff --git a/kleidicv/src/resize/resize_linear_sve2.cpp b/kleidicv/src/resize/resize_linear_sve2.cpp index ea094d2ea..1391740bb 100644 --- a/kleidicv/src/resize/resize_linear_sve2.cpp +++ b/kleidicv/src/resize/resize_linear_sve2.cpp @@ -6,21 +6,22 @@ #include "resize_linear_sc.h" namespace kleidicv::sve2 { -KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { - return resize_linear_u8_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height); +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t resize_linear_stripe_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { + return resize_linear_stripe_u8_sc(src, src_stride, src_width, src_height, + y_begin, y_end, dst, dst_stride, dst_width, + dst_height); } -kleidicv_error_t resize_linear_f32(const float *src, size_t src_stride, - size_t src_width, size_t src_height, - float *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { - return resize_linear_f32_sc(src, src_stride, src_width, src_height, dst, - dst_stride, dst_width, dst_height); +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t resize_linear_stripe_f32( + const float *src, size_t src_stride, size_t src_width, size_t src_height, + size_t y_begin, size_t y_end, float *dst, size_t dst_stride, + size_t dst_width, size_t dst_height) { + return resize_linear_stripe_f32_sc(src, src_stride, src_width, src_height, + y_begin, y_end, dst, dst_stride, dst_width, + dst_height); } } // namespace kleidicv::sve2 diff --git a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h index 0e290c4bb..eb5ba5fe1 100644 --- a/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h +++ b/kleidicv_thread/include/kleidicv_thread/kleidicv_thread.h @@ -86,16 +86,28 @@ kleidicv_error_t kleidicv_thread_yuv_sp_to_bgr_u8( size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_yuv_sp_to_bgra_u8( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, bool is_nv21, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_yuv_sp_to_rgba_u8( const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, @@ -169,17 +181,29 @@ kleidicv_error_t kleidicv_thread_min_max_loc_u8( const uint8_t *src, size_t src_stride, size_t width, size_t height, size_t *min_offset, size_t *max_offset, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_threshold_binary_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, uint8_t threshold, uint8_t value, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, float scale, float shift, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, float *dst, size_t dst_stride, size_t width, size_t height, @@ -233,11 +257,20 @@ KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_bitwise_and, uint8_t); KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_equal_u8, uint8_t); KLEIDICV_THREAD_BINARY_OP(kleidicv_thread_compare_greater_u8, uint8_t); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of +/// kleidicv_saturating_add_abs_with_threshold_s16 - see the documentation of +/// that function for more details. kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( const int16_t *src_a, size_t src_a_stride, const int16_t *src_b, size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, int16_t threshold, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_gaussian_blur_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_gaussian_blur_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, size_t kernel_width, @@ -245,6 +278,10 @@ kleidicv_error_t kleidicv_thread_gaussian_blur_u8( kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_separable_filter_2d_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_separable_filter_2d_u8( const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, const uint8_t *kernel_x, @@ -252,6 +289,10 @@ kleidicv_error_t kleidicv_thread_separable_filter_2d_u8( kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_separable_filter_2d_u16 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_separable_filter_2d_u16( const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, const uint16_t *kernel_x, @@ -259,6 +300,10 @@ kleidicv_error_t kleidicv_thread_separable_filter_2d_u16( kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_sobel_3x3_horizontal_s16_u8 - see +/// the documentation of that function for more details. kleidicv_error_t kleidicv_thread_separable_filter_2d_s16( const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, const int16_t *kernel_x, @@ -266,16 +311,51 @@ kleidicv_error_t kleidicv_thread_separable_filter_2d_s16( kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_sobel_3x3_horizontal_s16_u8 - see +/// the documentation of that function for more details. kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8( const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_sobel_3x3_vertical_s16_u8 - see the +/// documentation of that function for more details. kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, size_t width, size_t height, size_t channels, kleidicv_thread_multithreading); +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_to_quarter_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_resize_to_quarter_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_linear_u8 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_resize_linear_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading); + +/// Internal - not part of the public API and its direct use is not supported. +/// +/// Multithreaded implementation of kleidicv_resize_linear_f32 - see the +/// documentation of that function for more details. +kleidicv_error_t kleidicv_thread_resize_linear_f32( + const float *src, size_t src_stride, size_t src_width, size_t src_height, + float *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/kleidicv_thread/src/kleidicv_thread.cpp b/kleidicv_thread/src/kleidicv_thread.cpp index 7b9a7e493..e817530b5 100644 --- a/kleidicv_thread/src/kleidicv_thread.cpp +++ b/kleidicv_thread/src/kleidicv_thread.cpp @@ -13,6 +13,7 @@ #include "kleidicv/filters/separable_filter_2d.h" #include "kleidicv/filters/sobel.h" #include "kleidicv/kleidicv.h" +#include "kleidicv/resize/resize_linear.h" typedef std::function FunctionCallback; @@ -502,3 +503,55 @@ kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( return mt.parallel(kleidicv_thread_std_function_callback, &callback, mt.parallel_data, height); } + +kleidicv_error_t kleidicv_thread_resize_to_quarter_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading mt) { + FunctionCallback callback = [=](unsigned task_begin, unsigned task_end) { + size_t src_begin = size_t{task_begin} * 2; + size_t src_end = std::min(src_height, size_t{task_end} * 2); + size_t dst_begin = task_begin; + size_t dst_end = std::min(dst_height, task_end); + + // half of odd height is rounded towards zero? + if (dst_begin == dst_end) { + return KLEIDICV_OK; + } + + return kleidicv_resize_to_quarter_u8( + src + src_begin * src_stride, src_stride, src_width, + src_end - src_begin, dst + dst_begin * dst_stride, dst_stride, + dst_width, dst_end - dst_begin); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, + mt.parallel_data, (src_height + 1) / 2); +} + +kleidicv_error_t kleidicv_thread_resize_linear_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading mt) { + FunctionCallback callback = [=](unsigned y_begin, unsigned y_end) { + return kleidicv_resize_linear_stripe_u8( + src, src_stride, src_width, src_height, y_begin, + std::min(src_height, y_end + 1), dst, dst_stride, dst_width, + dst_height); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, + mt.parallel_data, std::max(1, src_height - 1)); +} + +kleidicv_error_t kleidicv_thread_resize_linear_f32( + const float *src, size_t src_stride, size_t src_width, size_t src_height, + float *dst, size_t dst_stride, size_t dst_width, size_t dst_height, + kleidicv_thread_multithreading mt) { + FunctionCallback callback = [=](unsigned y_begin, unsigned y_end) { + return kleidicv_resize_linear_stripe_f32( + src, src_stride, src_width, src_height, y_begin, + std::min(src_height, y_end + 1), dst, dst_stride, dst_width, + dst_height); + }; + return mt.parallel(kleidicv_thread_std_function_callback, &callback, + mt.parallel_data, std::max(1, src_height - 1)); +} diff --git a/scripts/ci.sh b/scripts/ci.sh index 5fb369bb4..e9b12ec71 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -53,7 +53,7 @@ CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake -S . -B build/gcc -G Ni ninja -C build/gcc # Run tests -LONG_VECTOR_TESTS="GRAY2.*:RGB*:Yuv*:Rgb*" +LONG_VECTOR_TESTS="GRAY2.*:RGB*:Yuv*:Rgb*:Resize*" TESTRESULT=0 qemu-aarch64 build/test/framework/kleidicv-framework-test --gtest_output=xml:build/test-results/ || TESTRESULT=1 qemu-aarch64 -cpu cortex-a35 build/test/api/kleidicv-api-test --gtest_output=xml:build/test-results/clang-neon/ || TESTRESULT=1 diff --git a/test/api/test_resize_linear.cpp b/test/api/test_resize_linear.cpp index 9d5f1df64..e4cd0aa2d 100644 --- a/test/api/test_resize_linear.cpp +++ b/test/api/test_resize_linear.cpp @@ -218,6 +218,12 @@ TYPED_TEST(ResizeLinear, NotImplemented) { EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, kleidicv_resize_linear(src, sizeof(TypeParam), 1, 1, dst, sizeof(TypeParam) * 2, 2, 4)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_resize_linear(src, sizeof(TypeParam), 1, 1, dst, + sizeof(TypeParam) * 2, 8, 4)); + EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED, + kleidicv_resize_linear(src, sizeof(TypeParam), 1, 1, dst, + sizeof(TypeParam) * 2, 4, 8)); } TYPED_TEST(ResizeLinear, NullPointer) { diff --git a/test/api/test_thread_resize.cpp b/test/api/test_thread_resize.cpp new file mode 100644 index 000000000..460e21d54 --- /dev/null +++ b/test/api/test_thread_resize.cpp @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include + +#include "framework/array.h" +#include "framework/generator.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv_thread/kleidicv_thread.h" +#include "multithreading_fake.h" + +// Tuple of width, height, thread count. +typedef std::tuple P; + +class ResizeThread : public testing::TestWithParam

{ + public: + void test_resize_to_quarter() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_to_quarter_u8, + kleidicv_thread_resize_to_quarter_u8, thread_count, + src_width, src_height, src_width / 2, src_height / 2); + check(kleidicv_resize_to_quarter_u8, + kleidicv_thread_resize_to_quarter_u8, thread_count, + src_width, src_height, (src_width + 1) / 2, + (src_height + 1) / 2); + } + void test_resize_u8_2x2() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_linear_u8, kleidicv_thread_resize_linear_u8, + thread_count, src_width, src_height, 2 * src_width, + 2 * src_height); + } + void test_resize_u8_4x4() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_linear_u8, kleidicv_thread_resize_linear_u8, + thread_count, src_width, src_height, 4 * src_width, + 4 * src_height); + } + void test_resize_f32_2x2() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_linear_f32, kleidicv_thread_resize_linear_f32, + thread_count, src_width, src_height, 2 * src_width, + 2 * src_height); + } + void test_resize_f32_4x4() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_linear_f32, kleidicv_thread_resize_linear_f32, + thread_count, src_width, src_height, 4 * src_width, + 4 * src_height); + } + void test_resize_f32_8x8() { + size_t src_width = 0, src_height = 0, thread_count = 0; + std::tie(src_width, src_height, thread_count) = GetParam(); + check(kleidicv_resize_linear_f32, kleidicv_thread_resize_linear_f32, + thread_count, src_width, src_height, 8 * src_width, + 8 * src_height); + } + + private: + template + static void check(SingleThreadedFunc single_threaded_func, + MultithreadedFunc multithreaded_func, size_t thread_count, + size_t src_width, size_t src_height, size_t dst_width, + size_t dst_height) { + test::Array2D src(src_width, src_height), + dst_single(dst_width, dst_height), dst_multi(dst_width, dst_height); + test::PseudoRandomNumberGenerator generator; + src.fill(generator); + dump(&src); + + kleidicv_error_t single_result = single_threaded_func( + src.data(), src.stride(), src_width, src_height, dst_single.data(), + dst_single.stride(), dst_width, dst_height); + + kleidicv_error_t multi_result = + multithreaded_func(src.data(), src.stride(), src_width, src_height, + dst_multi.data(), dst_multi.stride(), dst_width, + dst_height, get_multithreading_fake(thread_count)); + + EXPECT_EQ(KLEIDICV_OK, single_result); + EXPECT_EQ(KLEIDICV_OK, multi_result); + EXPECT_EQ_ARRAY2D(dst_single, dst_multi); + } +}; + +TEST_P(ResizeThread, ResizeToQuarter) { test_resize_to_quarter(); } + +TEST_P(ResizeThread, ResizeUint2x2) { test_resize_u8_2x2(); } + +TEST_P(ResizeThread, ResizeUint4x4) { test_resize_u8_4x4(); } + +TEST_P(ResizeThread, ResizeFloat2x2) { test_resize_f32_2x2(); } + +TEST_P(ResizeThread, ResizeFloat4x4) { test_resize_f32_4x4(); } + +TEST_P(ResizeThread, ResizeFloat8x8) { test_resize_f32_8x8(); } + +INSTANTIATE_TEST_SUITE_P(, ResizeThread, + testing::Values(P{1, 1, 1}, P{1, 2, 1}, P{1, 2, 2}, + P{2, 1, 2}, P{2, 2, 1}, P{1, 3, 2}, + P{2, 3, 1}, P{6, 4, 1}, P{4, 5, 2}, + P{2, 6, 3}, P{1, 7, 4}, P{12, 34, 5})); -- GitLab From 9ca5d84dd67624a2218a173cd25d3d700b835e51 Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Fri, 16 Aug 2024 16:06:25 +0000 Subject: [PATCH 2/3] Multithread resize: all process_row functions use only SVE --- kleidicv/src/resize/resize_linear_neon.cpp | 121 ++++++++++++--------- kleidicv/src/resize/resize_linear_sc.h | 106 +++++++++--------- 2 files changed, 122 insertions(+), 105 deletions(-) diff --git a/kleidicv/src/resize/resize_linear_neon.cpp b/kleidicv/src/resize/resize_linear_neon.cpp index e3a652870..c9df0eb77 100644 --- a/kleidicv/src/resize/resize_linear_neon.cpp +++ b/kleidicv/src/resize/resize_linear_neon.cpp @@ -771,6 +771,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( return vmlaq_n_f32(vmulq_n_f32(a, p), b, q); }; + auto lerp1d_vector_n2 = [](float32x4_t a, float q, float32x4_t b) { + return vmlaq_n_f32(a, b, q); + }; + auto lerp1d_vector = [](float32x4_t p, float32x4_t a, float32x4_t q, float32x4_t b) { return vmlaq_f32(vmulq_f32(a, p), b, q); @@ -840,24 +844,19 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( return vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(a, p), b, q), c, r), d, s); }; - auto process_row = [src_width, dst_width, lerp2d_vector, lerp1d_vector_n, - &coeffs_p0, &coeffs_q0, &coeffs_r0, &coeffs_s0, - &coeffs_p1, &coeffs_q1, &coeffs_r1, + auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n, + lerp1d_vector_n2, &coeffs_p0, &coeffs_q0, &coeffs_r0, + &coeffs_s0, &coeffs_p1, &coeffs_q1, &coeffs_r1, &coeffs_s1](const float *src_row0, const float *src_row1, float *dst_row0, size_t dst_stride) { - // Left & right elements - const float s0l = src_row0[0], s1l = src_row1[0]; - const float s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + // Left elements + float32x4_t s0 = vdupq_n_f32(src_row0[0]); + float32x4_t s1 = vdupq_n_f32(src_row1[0]); float *dst_row = dst_row0; for (size_t i = 0; i < 8; ++i) { - vst1q(dst_row, lerp1d_vector_n(static_cast(15 - i * 2) / 16.0F, - vdupq_n_f32(s0l), - static_cast(i * 2 + 1) / 16.0F, - vdupq_n_f32(s1l))); - vst1q(dst_row + dst_width - 4, - lerp1d_vector_n( - static_cast(15 - i * 2) / 16.0F, vdupq_n_f32(s0r), - static_cast(i * 2 + 1) / 16.0F, vdupq_n_f32(s1r))); + vst1q(dst_row, + lerp1d_vector_n(static_cast(15 - i * 2) / 16.0F, s0, + static_cast(i * 2 + 1) / 16.0F, s1)); dst_row += dst_stride; } @@ -870,53 +869,69 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32( float *dst_row5 = dst_row4 + dst_stride; float *dst_row6 = dst_row5 + dst_stride; float *dst_row7 = dst_row6 + dst_stride; - float32x4_t a, b = vdupq_n_f32(src_row0[0]); - float32x4_t c, d = vdupq_n_f32(src_row1[0]); + float32x4_t a, b = s0; + float32x4_t c, d = s1; for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { a = b; b = vdupq_n_f32(src_row0[src_x + 1]); c = d; d = vdupq_n_f32(src_row1[src_x + 1]); - float32x4_t dst_0 = + float32x4x2_t dst_0; + dst_0.val[0] = lerp2d_vector(coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); - vst1q(dst_row0, dst_0); - float32x4_t dst_7 = - lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); - vst1q(dst_row7, dst_7); - vst1q(dst_row1, lerp1d_vector_n(6.0 / 7, dst_0, 1.0 / 7, dst_7)); - vst1q(dst_row2, lerp1d_vector_n(5.0 / 7, dst_0, 2.0 / 7, dst_7)); - vst1q(dst_row3, lerp1d_vector_n(4.0 / 7, dst_0, 3.0 / 7, dst_7)); - vst1q(dst_row4, lerp1d_vector_n(3.0 / 7, dst_0, 4.0 / 7, dst_7)); - vst1q(dst_row5, lerp1d_vector_n(2.0 / 7, dst_0, 5.0 / 7, dst_7)); - vst1q(dst_row6, lerp1d_vector_n(1.0 / 7, dst_0, 6.0 / 7, dst_7)); - dst_row0 += 4; - dst_row1 += 4; - dst_row2 += 4; - dst_row3 += 4; - dst_row4 += 4; - dst_row5 += 4; - dst_row6 += 4; - dst_row7 += 4; - dst_0 = + dst_0.val[1] = lerp2d_vector(coeffs_p1, a, coeffs_q1, b, coeffs_r1, c, coeffs_s1, d); - vst1q(dst_row0, dst_0); - dst_7 = + vst1q_x2(dst_row0, dst_0); + + float32x4x2_t dst_7; + dst_7.val[0] = + lerp2d_vector(coeffs_r0, a, coeffs_s0, b, coeffs_p0, c, coeffs_q0, d); + dst_7.val[1] = lerp2d_vector(coeffs_r1, a, coeffs_s1, b, coeffs_p1, c, coeffs_q1, d); - vst1q(dst_row7, dst_7); - vst1q(dst_row1, lerp1d_vector_n(6.0 / 7, dst_0, 1.0 / 7, dst_7)); - vst1q(dst_row2, lerp1d_vector_n(5.0 / 7, dst_0, 2.0 / 7, dst_7)); - vst1q(dst_row3, lerp1d_vector_n(4.0 / 7, dst_0, 3.0 / 7, dst_7)); - vst1q(dst_row4, lerp1d_vector_n(3.0 / 7, dst_0, 4.0 / 7, dst_7)); - vst1q(dst_row5, lerp1d_vector_n(2.0 / 7, dst_0, 5.0 / 7, dst_7)); - vst1q(dst_row6, lerp1d_vector_n(1.0 / 7, dst_0, 6.0 / 7, dst_7)); - dst_row0 += 4; - dst_row1 += 4; - dst_row2 += 4; - dst_row3 += 4; - dst_row4 += 4; - dst_row5 += 4; - dst_row6 += 4; - dst_row7 += 4; + vst1q_x2(dst_row7, dst_7); + + float32x4_t delta07_0 = vsubq_f32(dst_7.val[0], dst_0.val[0]); + float32x4_t delta07_1 = vsubq_f32(dst_7.val[1], dst_0.val[1]); + + float32x4x2_t dst; + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 1.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 1.0 / 7, delta07_1); + vst1q_x2(dst_row1, dst); + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 2.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 2.0 / 7, delta07_1); + vst1q_x2(dst_row2, dst); + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 3.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 3.0 / 7, delta07_1); + vst1q_x2(dst_row3, dst); + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 4.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 4.0 / 7, delta07_1); + vst1q_x2(dst_row4, dst); + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 5.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 5.0 / 7, delta07_1); + vst1q_x2(dst_row5, dst); + dst.val[0] = lerp1d_vector_n2(dst_0.val[0], 6.0 / 7, delta07_0); + dst.val[1] = lerp1d_vector_n2(dst_0.val[1], 6.0 / 7, delta07_1); + vst1q_x2(dst_row6, dst); + + dst_row0 += 8; + dst_row1 += 8; + dst_row2 += 8; + dst_row3 += 8; + dst_row4 += 8; + dst_row5 += 8; + dst_row6 += 8; + dst_row7 += 8; + } + + // Right elements + s0 = b; + s1 = d; + dst_row = dst_row0; + for (size_t i = 0; i < 8; ++i) { + vst1q(dst_row, + lerp1d_vector_n(static_cast(15 - i * 2) / 16.0F, s0, + static_cast(i * 2 + 1) / 16.0F, s1)); + dst_row += dst_stride; } }; diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index 920f64da6..57067176c 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -19,10 +19,6 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( size_t dst_width = src_width * 2; size_t dst_height = src_height * 2; - auto lerp1d_scalar = - [](uint8_t near, uint8_t far) - KLEIDICV_STREAMING_COMPATIBLE { return (near * 3 + far + 2) >> 2; }; - auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING_COMPATIBLE { // near * 3 @@ -100,19 +96,18 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( } }; - auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, uint8_t *dst_row1) KLEIDICV_STREAMING_COMPATIBLE { - // Left edge - dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); - dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); - - // Right edge - dst_row0[dst_width - 1] = - lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); - dst_row1[dst_width - 1] = - lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); + // Left elements + svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read/write 1 element + { + svuint8_t s0 = svld1(pg1, src_row0); + svuint8_t s1 = svld1(pg1, src_row1); + svst1(pg1, dst_row0, lerp1d_vector(s0, s1)); + svst1(pg1, dst_row1, lerp1d_vector(s1, s0)); + } // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { @@ -133,6 +128,12 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc( svst2_u8(pg, dst_row0 + dst_x, svcreate2(dst_tl, dst_tr)); svst2_u8(pg, dst_row1 + dst_x, svcreate2(dst_bl, dst_br)); } + + // Right elements + svuint8_t s0 = svld1(pg1, src_row0 + src_width - 1); + svuint8_t s1 = svld1(pg1, src_row1 + src_width - 1); + svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(s0, s1)); + svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(s1, s0)); }; // Top row @@ -167,10 +168,6 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( size_t dst_width = src_width * 4; size_t dst_height = src_height * 4; - auto lerp1d_scalar = - [](uint8_t p, uint8_t a, uint8_t q, uint8_t b) - KLEIDICV_STREAMING_COMPATIBLE { return (p * a + q * b + 4) >> 3; }; - auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b) KSC { // bias svuint16_t top = svdup_u16(4); @@ -236,26 +233,22 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( } }; - auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const uint8_t *src_row0, const uint8_t *src_row1, uint8_t *dst_row0, uint8_t *dst_row1, uint8_t *dst_row2, uint8_t *dst_row3) KLEIDICV_STREAMING_COMPATIBLE { // Left elements - const uint8_t s0l = src_row0[0], s1l = src_row1[0]; - dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); - dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); - dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); - dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); - - // Right elements - const size_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; - const size_t dr0 = dst_width - 2; - const size_t dr1 = dst_width - 1; - dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); - dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); - dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); - dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); + svbool_t pg1 = svptrue_pat_b8(SV_VL1); // read 1 element + svbool_t pg2 = svptrue_pat_b8(SV_VL2); // write 2 elements + { + svuint8_t s0 = svdup_lane(svld1(pg1, src_row0), 0); + svuint8_t s1 = svdup_lane(svld1(pg1, src_row1), 0); + svst1(pg2, dst_row0, lerp1d_vector(7, s0, 1, s1)); + svst1(pg2, dst_row1, lerp1d_vector(5, s0, 3, s1)); + svst1(pg2, dst_row2, lerp1d_vector(3, s0, 5, s1)); + svst1(pg2, dst_row3, lerp1d_vector(1, s0, 7, s1)); + } // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { @@ -290,6 +283,14 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc( lerp2d_vector(3, a, 5, b, 21, c, 35, d), lerp2d_vector(49, d, 7, b, 7, c, 1, a)))); } + + // Right elements + svuint8_t s0 = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0); + svuint8_t s1 = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0); + svst1(pg2, dst_row0 + dst_width - 2, lerp1d_vector(7, s0, 1, s1)); + svst1(pg2, dst_row1 + dst_width - 2, lerp1d_vector(5, s0, 3, s1)); + svst1(pg2, dst_row2 + dst_width - 2, lerp1d_vector(3, s0, 5, s1)); + svst1(pg2, dst_row3 + dst_width - 2, lerp1d_vector(1, s0, 7, s1)); }; auto copy_dst_row = [src_width](const uint8_t *dst_from, @@ -338,10 +339,6 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( src_stride /= sizeof(float); dst_stride /= sizeof(float); - auto lerp1d_scalar = [](float near, float far) KLEIDICV_STREAMING_COMPATIBLE { - return near * 0.75F + far * 0.25F; - }; - auto lerp1d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t far) KLEIDICV_STREAMING_COMPATIBLE { return svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.75F), far, 0.25F); @@ -385,12 +382,17 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( dst_row[dst_width - 1] = src_row[src_width - 1]; }; - auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_vector]( + auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector]( const float *src_row0, const float *src_row1, float *dst_row0, float *dst_row1) KSC { - // Left element - dst_row0[0] = lerp1d_scalar(src_row0[0], src_row1[0]); - dst_row1[0] = lerp1d_scalar(src_row1[0], src_row0[0]); + // Left elements + svbool_t pg1 = svptrue_pat_b32(SV_VL1); // read/write 1 element + { + svfloat32_t s0 = svld1(pg1, src_row0); + svfloat32_t s1 = svld1(pg1, src_row1); + svst1(pg1, dst_row0, lerp1d_vector(pg1, s0, s1)); + svst1(pg1, dst_row1, lerp1d_vector(pg1, s1, s0)); + } // Middle elements for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) { @@ -411,11 +413,11 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc( lerp2d_vector(pg, d, b, c, a))); } - // Right element - dst_row0[dst_width - 1] = - lerp1d_scalar(src_row0[src_width - 1], src_row1[src_width - 1]); - dst_row1[dst_width - 1] = - lerp1d_scalar(src_row1[src_width - 1], src_row0[src_width - 1]); + // Right elements + svfloat32_t s0 = svld1(pg1, src_row0 + src_width - 1); + svfloat32_t s1 = svld1(pg1, src_row1 + src_width - 1); + svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(pg1, s0, s1)); + svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(pg1, s1, s0)); }; // Top row @@ -720,13 +722,13 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( float *dst_row5 = dst_row4 + dst_stride; float *dst_row6 = dst_row5 + dst_stride; float *dst_row7 = dst_row6 + dst_stride; - svfloat32_t a, b = svdup_n_f32(src_row0[0]); - svfloat32_t c, d = svdup_n_f32(src_row1[0]); + svfloat32_t a, b = s0l; + svfloat32_t c, d = s1l; for (size_t src_x = 0; src_x + 1 < src_width; src_x++) { a = b; - b = svdup_n_f32(src_row0[src_x + 1]); + b = svdup_lane(svld1(pg1, src_row0 + src_x + 1), 0); c = d; - d = svdup_n_f32(src_row1[src_x + 1]); + d = svdup_lane(svld1(pg1, src_row1 + src_x + 1), 0); svfloat32_t dst_0 = lerp2d_vector(svptrue_b32(), coeffs_p0, a, coeffs_q0, b, coeffs_r0, c, coeffs_s0, d); svst1(svptrue_b32(), dst_row0, dst_0); @@ -783,8 +785,8 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc( // Right elements dst_lr = dst_row0; - svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0); - svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0); + svfloat32_t s0r = b; + svfloat32_t s1r = d; for (size_t i = 0; i < 8; ++i) { svst1(pg4, dst_lr, lerp1d_vector_n(pg4, static_cast(15 - i * 2) / 16.0F, s0r, -- GitLab From 94a57ffaa7ec7d3afd74567eee1d0bd2228e490d Mon Sep 17 00:00:00 2001 From: Denes Tarjan Date: Fri, 30 Aug 2024 09:59:30 +0000 Subject: [PATCH 3/3] Multithread resize: don't enable for 4x4 and 8x8 float --- adapters/opencv/kleidicv_hal.cpp | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 8c169eae3..a9f106e47 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -777,10 +777,28 @@ int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, get_multithreading())); case CV_32F: - return convert_error(kleidicv_thread_resize_linear_f32( - reinterpret_cast(src_data), src_step, src_width, - src_height, reinterpret_cast(dst_data), dst_step, dst_width, - dst_height, get_multithreading())); + if (inv_scale_x <= 2.1 && inv_scale_y <= 2.1) { + return convert_error(kleidicv_thread_resize_linear_f32( + reinterpret_cast(src_data), src_step, src_width, + src_height, reinterpret_cast(dst_data), dst_step, + dst_width, dst_height, get_multithreading())); + } else { + // Bigger resize algorithms (4x4 and 8x8) don't perform well with + // multiple threads +#if KLEIDICV_ENABLE_ALL_OPENCV_HAL + return convert_error(kleidicv_thread_resize_linear_f32( + reinterpret_cast(src_data), src_step, src_width, + src_height, reinterpret_cast(dst_data), dst_step, + dst_width, dst_height, get_multithreading())); +#else + if (cv::getNumThreads() == 1) { + return convert_error(kleidicv_resize_linear_f32( + reinterpret_cast(src_data), src_step, src_width, + src_height, reinterpret_cast(dst_data), dst_step, + dst_width, dst_height)); + } +#endif // KLEIDICV_ENABLE_ALL_OPENCV_HAL + } } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -- GitLab