diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index 4a65e5e20d2582be7089f98c35293d6427393cf0..c2fff2edf40d0eecf30f98526305b60322b5afb0 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -494,10 +494,6 @@ int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, kleidicv_resize_linear_u8(src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height)); case CV_32F: - // 4*4 performance uplift is inconsistent so don't use it. - if (src_width * 4 == dst_width && src_height * 4 == dst_height) { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } return convert_error(kleidicv_resize_linear_f32( reinterpret_cast(src_data), src_step, src_width, src_height, reinterpret_cast(dst_data), dst_step, dst_width, diff --git a/doc/opencv.md b/doc/opencv.md index ea7b979362216bccbe19dd56ab07f5ba9734cf44..a1110198680df76fc9391b9b6b09c96bffc1a19c 100644 --- a/doc/opencv.md +++ b/doc/opencv.md @@ -145,7 +145,7 @@ Release context set up by [`morphology_init`](#morphology_init). ### `resize` Notes on parameters: * In-place operation not supported. -* `src_type` - only supports `CV_8UC1` or, for 2*2 resize only, `CV_32FC1`. +* `src_type` - only supports `CV_8UC1` or `CV_32FC1`, relative sizes can be 0.5x0.5 (`CV_8UC1` only), 2x2 and 4x4. * `dst_width`,`dst_height` - must both be the same multiple of `src_width` and `src_height` respectively, and that multiple must be either 0.5, 2 or 4. * `inv_scale_x`,`inv_scale_y` - must be 0 or `dst_width / src_width`. * `interpolation` - Must be `INTER_LINEAR` or `INTER_AREA` (0.5 by 0.5 only). diff --git a/kleidicv/src/resize/resize_linear_neon.cpp b/kleidicv/src/resize/resize_linear_neon.cpp index 246eb7446b9fb03f4fda7dbc43cc3f3add3d3fa5..5f216d3a1259ec9381a7f3a3fc5fe8b87dbbd101 100644 --- a/kleidicv/src/resize/resize_linear_neon.cpp +++ b/kleidicv/src/resize/resize_linear_neon.cpp @@ -605,10 +605,10 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( } }; - auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar, - lerp2d_vector](const T *src_row0, const T *src_row1, - T *dst_row0, T *dst_row1, T *dst_row2, - T *dst_row3) { + auto process_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector, + lerp2d_scalar, lerp2d_vector]( + const T *src_row0, const T *src_row1, T *dst_row0, + T *dst_row1, T *dst_row2, T *dst_row3) { // Left elements const T s0l = src_row0[0], s1l = src_row1[0]; dst_row0[0] = dst_row0[1] = lerp1d_scalar(0.875F, s0l, 0.125F, s1l); @@ -636,46 +636,36 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32( float32x4_t c = vld1q_f32(src_row1 + src_x); float32x4_t d = vld1q_f32(src_row1 + src_x + 1); - vst4q_f32(dst_row0 + dst_x, (float32x4x4_t{ - lerp2d_vector(0.765625F, a, 0.109375F, b, - 0.109375F, c, 0.015625F, d), - lerp2d_vector(0.546875F, a, 0.328125F, b, - 0.078125F, c, 0.046875F, d), - lerp2d_vector(0.328125F, a, 0.546875F, b, - 0.046875F, c, 0.078125F, d), - lerp2d_vector(0.109375F, a, 0.765625F, b, - 0.015625F, c, 0.109375F, d), - })); - vst4q_f32(dst_row1 + dst_x, (float32x4x4_t{ - lerp2d_vector(0.546875F, a, 0.078125F, b, - 0.328125F, c, 0.046875F, d), - lerp2d_vector(0.390625F, a, 0.234375F, b, - 0.234375F, c, 0.140625F, d), - lerp2d_vector(0.234375F, a, 0.390625F, b, - 0.140625F, c, 0.234375F, d), - lerp2d_vector(0.078125F, a, 0.546875F, b, - 0.046875F, c, 0.328125F, d), - })); - vst4q_f32(dst_row2 + dst_x, (float32x4x4_t{ - lerp2d_vector(0.328125F, a, 0.046875F, b, - 0.546875F, c, 0.078125F, d), - lerp2d_vector(0.234375F, a, 0.140625F, b, - 0.390625F, c, 0.234375F, d), - lerp2d_vector(0.140625F, a, 0.234375F, b, - 0.234375F, c, 0.390625F, d), - lerp2d_vector(0.046875F, a, 0.328125F, b, - 0.078125F, c, 0.546875F, d), - })); - vst4q_f32(dst_row3 + dst_x, (float32x4x4_t{ - lerp2d_vector(0.109375F, a, 0.015625F, b, - 0.765625F, c, 0.109375F, d), - lerp2d_vector(0.078125F, a, 0.046875F, b, - 0.546875F, c, 0.328125F, d), - lerp2d_vector(0.046875F, a, 0.078125F, b, - 0.328125F, c, 0.546875F, d), - lerp2d_vector(0.015625F, a, 0.109375F, b, - 0.109375F, c, 0.765625F, d), - })); + float32x4x4_t dst_a{ + lerp2d_vector(0.765625F, a, 0.109375F, b, 0.109375F, c, 0.015625F, d), + lerp2d_vector(0.546875F, a, 0.328125F, b, 0.078125F, c, 0.046875F, d), + lerp2d_vector(0.328125F, a, 0.546875F, b, 0.046875F, c, 0.078125F, d), + lerp2d_vector(0.109375F, a, 0.765625F, b, 0.015625F, c, 0.109375F, d), + }; + float32x4x4_t dst_d{ + lerp2d_vector(0.109375F, a, 0.015625F, b, 0.765625F, c, 0.109375F, d), + lerp2d_vector(0.078125F, a, 0.046875F, b, 0.546875F, c, 0.328125F, d), + lerp2d_vector(0.046875F, a, 0.078125F, b, 0.328125F, c, 0.546875F, d), + lerp2d_vector(0.015625F, a, 0.109375F, b, 0.109375F, c, 0.765625F, d), + }; + const float one_3rd = 0.3333333333333333F; + const float two_3rd = 0.6666666666666667F; + vst4q_f32(dst_row0 + dst_x, dst_a); + vst4q_f32(dst_row1 + dst_x, + (float32x4x4_t{ + lerp1d_vector(two_3rd, dst_a.val[0], one_3rd, dst_d.val[0]), + lerp1d_vector(two_3rd, dst_a.val[1], one_3rd, dst_d.val[1]), + lerp1d_vector(two_3rd, dst_a.val[2], one_3rd, dst_d.val[2]), + lerp1d_vector(two_3rd, dst_a.val[3], one_3rd, dst_d.val[3]), + })); + vst4q_f32(dst_row2 + dst_x, + (float32x4x4_t{ + lerp1d_vector(one_3rd, dst_a.val[0], two_3rd, dst_d.val[0]), + lerp1d_vector(one_3rd, dst_a.val[1], two_3rd, dst_d.val[1]), + lerp1d_vector(one_3rd, dst_a.val[2], two_3rd, dst_d.val[2]), + lerp1d_vector(one_3rd, dst_a.val[3], two_3rd, dst_d.val[3]), + })); + vst4q_f32(dst_row3 + dst_x, dst_d); } for (; src_x + 1 < src_width; ++src_x) { diff --git a/kleidicv/src/resize/resize_linear_sc.h b/kleidicv/src/resize/resize_linear_sc.h index da30d2ec4cbfa68717adb5719af5591e48a0173f..811ca965aed18340e8013d9cd1bb3a5a4e4e3ed7 100644 --- a/kleidicv/src/resize/resize_linear_sc.h +++ b/kleidicv/src/resize/resize_linear_sc.h @@ -506,7 +506,7 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( } }; - auto process_row = [src_width, lerp2d_vector]( + auto process_row = [src_width, lerp1d_vector, lerp2d_vector]( const float *src_row0, const float *src_row1, float *dst_row0, float *dst_row1, float *dst_row2, float *dst_row3) KLEIDICV_STREAMING_COMPATIBLE { @@ -521,43 +521,46 @@ KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc( svfloat32_t c = svld1_f32(pg, src_row1 + src_x); svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1); - svst4_f32(pg, dst_row0 + dst_x, - (svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, - 0.109375F, c, 0.015625F, d), - lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, - 0.078125F, c, 0.046875F, d), - lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, - 0.046875F, c, 0.078125F, d), - lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, - 0.015625F, c, 0.109375F, d)))); - + svfloat32x4_t dst_a = + svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, 0.109375F, c, + 0.015625F, d), + lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, 0.078125F, c, + 0.046875F, d), + lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, 0.046875F, c, + 0.078125F, d), + lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, 0.015625F, c, + 0.109375F, d)); + svfloat32x4_t dst_d = + svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, 0.765625F, c, + 0.109375F, d), + lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, 0.546875F, c, + 0.328125F, d), + lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, 0.328125F, c, + 0.546875F, d), + lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, 0.109375F, c, + 0.765625F, d)); + const float one_3rd = 0.3333333333333333F; + const float two_3rd = 0.6666666666666667F; + svst4_f32(pg, dst_row0 + dst_x, dst_a); svst4_f32(pg, dst_row1 + dst_x, - (svcreate4(lerp2d_vector(pg, 0.546875F, a, 0.078125F, b, - 0.328125F, c, 0.046875F, d), - lerp2d_vector(pg, 0.390625F, a, 0.234375F, b, - 0.234375F, c, 0.140625F, d), - lerp2d_vector(pg, 0.234375F, a, 0.390625F, b, - 0.140625F, c, 0.234375F, d), - lerp2d_vector(pg, 0.078125F, a, 0.546875F, b, - 0.046875F, c, 0.328125F, d)))); + svcreate4(lerp1d_vector(pg, two_3rd, svget4(dst_a, 0), one_3rd, + svget4(dst_d, 0)), + lerp1d_vector(pg, two_3rd, svget4(dst_a, 1), one_3rd, + svget4(dst_d, 1)), + lerp1d_vector(pg, two_3rd, svget4(dst_a, 2), one_3rd, + svget4(dst_d, 2)), + lerp1d_vector(pg, two_3rd, svget4(dst_a, 3), one_3rd, + svget4(dst_d, 3)))); svst4_f32(pg, dst_row2 + dst_x, - (svcreate4(lerp2d_vector(pg, 0.328125F, a, 0.046875F, b, - 0.546875F, c, 0.078125F, d), - lerp2d_vector(pg, 0.234375F, a, 0.140625F, b, - 0.390625F, c, 0.234375F, d), - lerp2d_vector(pg, 0.140625F, a, 0.234375F, b, - 0.234375F, c, 0.390625F, d), - lerp2d_vector(pg, 0.046875F, a, 0.328125F, b, - 0.078125F, c, 0.546875F, d)))); - svst4_f32(pg, dst_row3 + dst_x, - (svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, - 0.765625F, c, 0.109375F, d), - lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, - 0.546875F, c, 0.328125F, d), - lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, - 0.328125F, c, 0.546875F, d), - lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, - 0.109375F, c, 0.765625F, d)))); + svcreate4(lerp1d_vector(pg, one_3rd, svget4(dst_a, 0), two_3rd, + svget4(dst_d, 0)), + lerp1d_vector(pg, one_3rd, svget4(dst_a, 1), two_3rd, + svget4(dst_d, 1)), + lerp1d_vector(pg, one_3rd, svget4(dst_a, 2), two_3rd, + svget4(dst_d, 2)), + lerp1d_vector(pg, one_3rd, svget4(dst_a, 3), two_3rd, + svget4(dst_d, 3)))); + svst4_f32(pg, dst_row3 + dst_x, dst_d); } };