diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 2c484b167ead5102c56268c3263cb4c4cfb6aba2..ab24e096da019426c73ba27cfd620b477dc6f941 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -60,12 +60,13 @@ static void min_max_loc_u8(benchmark::State& state) { } BENCHMARK(min_max_loc_u8); -static void resize_linear_u8(benchmark::State& state) { +static void resize_linear_u8(size_t scale_x, size_t scale_y, + benchmark::State& state) { // Setup - size_t src_width = image_width / 2; - size_t src_height = image_height / 2; - size_t dst_width = src_width * 2; - size_t dst_height = src_height * 2; + size_t src_width = image_width / scale_x; + size_t src_height = image_height / scale_y; + size_t dst_width = src_width * scale_x; + size_t dst_height = src_height * scale_y; std::vector src, dst; src.resize(src_width * src_height); dst.resize(dst_width * dst_height); @@ -80,4 +81,13 @@ static void resize_linear_u8(benchmark::State& state) { (void)unused; } } -BENCHMARK(resize_linear_u8); + +static void resize_linear_2x2_u8(benchmark::State& state) { + resize_linear_u8(2, 2, state); +} +BENCHMARK(resize_linear_2x2_u8); + +static void resize_linear_4x4_u8(benchmark::State& state) { + resize_linear_u8(4, 4, state); +} +BENCHMARK(resize_linear_4x4_u8); diff --git a/intrinsiccv/src/resize/resize_linear_neon.cpp b/intrinsiccv/src/resize/resize_linear_neon.cpp index 18733e4f18c8331ac874b6d7f1199a04475242a2..6969e93781a62fceb4b4ffafc9ed289b2f5780c1 100644 --- a/intrinsiccv/src/resize/resize_linear_neon.cpp +++ b/intrinsiccv/src/resize/resize_linear_neon.cpp @@ -8,13 +8,58 @@ namespace intrinsiccv::neon { +template +uint8x8_t lerp2d_vector_p_q_q_1(uint8x8_t a, uint8x8_t b, uint8x8_t c, + uint8x8_t d) { + // Per lane: (a * P + (b + c) * Q + d + Bias) >> Shift. Start with b + c, widened to 16 bits + uint16x8_t b_c = vaddl_u8(b, c); + + // a * p (widening 8-bit multiply) + uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); + + // a * p + (b + c) * q + uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); + + // d + bias + uint16x8_t d_bias = vaddl_u8(d,
vdup_n_u8(Bias)); + + // a * p + (b + c) * q + d + bias + uint16x8_t ap_bcq_d_bias = vaddq_u16(ap_bcq, d_bias); + + // (a * p + (b + c) * q + d + bias) >> shift, narrowed back to 8 bits + uint8x8_t result = vshrn_n_u16(ap_bcq_d_bias, Shift); + return result; +} + +template +uint8x8_t lerp2d_vector_p_q_q_r(uint8x8_t a, uint8x8_t b, uint8x8_t c, + uint8x8_t d) { + // Per lane: (a * P + (b + c) * Q + d * R + Bias) >> Shift. Start with b + c, widened to 16 bits + uint16x8_t b_c = vaddl_u8(b, c); + + // a * p (widening 8-bit multiply) + uint16x8_t ap = vmull_u8(a, vdup_n_u8(P)); + + // d * r (widening 8-bit multiply) + uint16x8_t dr = vmull_u8(d, vdup_n_u8(R)); + + // a * p + (b + c) * q + uint16x8_t ap_bcq = vmlaq_u16(ap, b_c, vdupq_n_u16(Q)); + + // d * r + bias + uint16x8_t dr_bias = vaddq_u16(dr, vdupq_n_u16(Bias)); + + // a * p + (b + c) * q + d * r + bias + uint16x8_t ap_bcq_dr_bias = vaddq_u16(ap_bcq, dr_bias); + + // (a * p + (b + c) * q + d * r + bias) >> shift, narrowed back to 8 bits + uint8x8_t result = vshrn_n_u16(ap_bcq_dr_bias, Shift); + return result; +} + INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_2x2_u8( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride) { - if (src_width == 0 || src_height == 0) { - return INTRINSICCV_OK; - } - size_t dst_width = src_width * 2; auto lerp1d_scalar = [](uint8_t near, uint8_t far) { @@ -45,30 +90,8 @@ INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_2x2_u8( return (near * 9 + (mid_a + mid_b) * 3 + far + 8) >> 4; }; - auto lerp2d_vector = [](uint8x8_t near, uint8x8_t mid_a, uint8x8_t mid_b, - uint8x8_t far) { - uint8x8_t nine = vdup_n_u8(9); - uint16x8_t three = vdupq_n_u16(3); - uint8x8_t eight = vdup_n_u8(8); - - // mid_a + mid_b - uint16x8_t mid = vaddl_u8(mid_a, mid_b); - - // near * 9 - uint16x8_t near9 = vmull_u8(near, nine); - - // near * 9 + (mid_a + mid_b) * 3 - uint16x8_t near9_mid3 = vmlaq_u16(near9, mid, three); - - // far + 8 - uint16x8_t far_8 = vaddl_u8(far, eight); - - // near * 9 + (mid_a + mid_b) * 3 + far + 8 - uint16x8_t near9_mid3_far_8 = vaddq_u16(near9_mid3, far_8); - - // (near * 9 + (mid_a +
mid_b) * 3 + far + 8) / 16 - uint8x8_t near9_mid3_far_8_div16 = vshrn_n_u16(near9_mid3_far_8, 4); - return near9_mid3_far_8_div16; + auto lerp2d_vector = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { + return lerp2d_vector_p_q_q_1<9, 3, 8, 4>(a, b, c, d); }; // Handle top or bottom edge @@ -164,6 +187,227 @@ INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_2x2_u8( return INTRINSICCV_OK; } +INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_4x4_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride) { + size_t dst_width = src_width * 4, dst_height = src_height * 4; + + auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, + uint8_t b) { + return (coeff_a * a + coeff_b * b + 4) >> 3; + }; + auto lerp1d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, + uint8_t coeff_b_scalar, uint8x8_t b) { + uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); + uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); + uint16x8_t four = vdupq_n_u16(4); + + // a * coeff_a (widened to 16 bits) + uint16x8_t a1 = vmull_u8(a, coeff_a); + + // b * coeff_b (widened to 16 bits) + uint16x8_t b1 = vmull_u8(b, coeff_b); + + // a * coeff_a + b * coeff_b + uint16x8_t a1_b1 = vaddq_u16(a1, b1); + + // a * coeff_a + b * coeff_b + 4 + uint16x8_t a1_b1_4 = vaddq_u16(a1_b1, four); + + // (a * coeff_a + b * coeff_b + 4) / 8, narrowed back to 8 bits + uint8x8_t result = vshrn_n_u16(a1_b1_4, 3); + + return result; + }; + auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, + uint8_t b, uint8_t coeff_c, uint8_t c, + uint8_t coeff_d, uint8_t d) { + return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6; + }; + auto lerp2d_vector = [](uint8_t coeff_a_scalar, uint8x8_t a, + uint8_t coeff_b_scalar, uint8x8_t b, + uint8_t coeff_c_scalar, uint8x8_t c, + uint8_t coeff_d_scalar, uint8x8_t d) { + uint8x8_t coeff_a = vdup_n_u8(coeff_a_scalar); + uint8x8_t coeff_b = vdup_n_u8(coeff_b_scalar); + uint8x8_t coeff_c = vdup_n_u8(coeff_c_scalar); + uint8x8_t
coeff_d = vdup_n_u8(coeff_d_scalar); + uint16x8_t thirtytwo = vdupq_n_u16(32); + + // a * coeff_a + uint16x8_t a1 = vmull_u8(a, coeff_a); + + // b * coeff_b + uint16x8_t b1 = vmull_u8(b, coeff_b); + + // c * coeff_c + uint16x8_t c1 = vmull_u8(c, coeff_c); + + // d * coeff_d + uint16x8_t d1 = vmull_u8(d, coeff_d); + + // a * coeff_a + b * coeff_b + uint16x8_t a1_b1 = vaddq_u16(a1, b1); + + // c * coeff_c + d * coeff_d + uint16x8_t c1_d1 = vaddq_u16(c1, d1); + + // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + uint16x8_t a1_b1_c1_d1 = vaddq_u16(a1_b1, c1_d1); + + // a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32 + uint16x8_t a1_b1_c1_d1_32 = vaddq_u16(a1_b1_c1_d1, thirtytwo); + + // (a * coeff_a + b * coeff_b + c * coeff_c + d * coeff_d + 32) / 64, narrowed back to 8 bits + uint8x8_t result = vshrn_n_u16(a1_b1_c1_d1_32, 6); + return result; + }; + // Handle top or bottom edge: interpolate horizontally only; the two outermost pixels on each side replicate the edge source pixel + auto process_edge_row = [src_width, dst_width, lerp1d_scalar, lerp1d_vector]( + const uint8_t *src_row, uint8_t *dst_row) { + // Left elements + dst_row[1] = dst_row[0] = src_row[0]; + + // Right elements + dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; + + // Middle elements + size_t src_x = 0; + for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { + size_t dst_x = src_x * 4 + 2; + uint8x8_t a = vld1_u8(src_row + src_x); + uint8x8_t b = vld1_u8(src_row + src_x + 1); + uint8x8x4_t interpolated = { + lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b), + lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)}; + + vst4_u8(dst_row + dst_x, interpolated); + } + for (; src_x + 1 < src_width; ++src_x) { + size_t dst_x = src_x * 4 + 2; + const uint8_t a = src_row[src_x], b = src_row[src_x + 1]; + dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b); + dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b); + dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b); + dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b); + } + }; + + auto process_row = [src_width, dst_width,
lerp1d_scalar, lerp2d_scalar, + lerp2d_vector](const uint8_t *src_row0, + const uint8_t *src_row1, uint8_t *dst_row0, + uint8_t *dst_row1, uint8_t *dst_row2, + uint8_t *dst_row3) { + auto lerp2d_vector_49_7_7_1 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, + uint8x8_t d) { + return lerp2d_vector_p_q_q_1<49, 7, 32, 6>(a, b, c, d); + }; + auto lerp2d_vector_25_15_15_9 = [](uint8x8_t a, uint8x8_t b, uint8x8_t c, + uint8x8_t d) { + return lerp2d_vector_p_q_q_r<25, 15, 9, 32, 6>(a, b, c, d); + }; + + // Left elements + const uint8_t s0l = src_row0[0], s1l = src_row1[0]; + dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); + dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); + dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); + dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); + + // Right elements + const uint8_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + const size_t dr0 = dst_width - 2; + const size_t dr1 = dst_width - 1; + dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); + dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); + dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); + dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); + + // Middle elements + size_t src_x = 0; + for (; src_x + sizeof(uint8x8_t) < src_width; src_x += sizeof(uint8x8_t)) { + size_t dst_x = src_x * 4 + 2; + + uint8x8_t a = vld1_u8(src_row0 + src_x); + uint8x8_t b = vld1_u8(src_row0 + src_x + 1); + uint8x8_t c = vld1_u8(src_row1 + src_x); + uint8x8_t d = vld1_u8(src_row1 + src_x + 1); + + vst4_u8(dst_row0 + dst_x, (uint8x8x4_t{ + lerp2d_vector_49_7_7_1(a, b, c, d), + lerp2d_vector(35, a, 21, b, 5, c, 3, d), + lerp2d_vector(21, a, 35, b, 3, c, 5, d), + lerp2d_vector_49_7_7_1(b, a, d, c), + })); + vst4_u8(dst_row1 + dst_x, (uint8x8x4_t{ + lerp2d_vector(35, a, 5, b, 21, c, 3, d), + lerp2d_vector_25_15_15_9(a, b, c, d), + lerp2d_vector_25_15_15_9(b, a, d, c), + lerp2d_vector(5, a, 35, b, 3, c, 21,
d), + })); + vst4_u8(dst_row2 + dst_x, (uint8x8x4_t{ + lerp2d_vector(21, a, 3, b, 35, c, 5, d), + lerp2d_vector_25_15_15_9(c, a, d, b), + lerp2d_vector_25_15_15_9(d, b, c, a), + lerp2d_vector(3, a, 21, b, 5, c, 35, d), + })); + vst4_u8(dst_row3 + dst_x, (uint8x8x4_t{ + lerp2d_vector_49_7_7_1(c, a, d, b), + lerp2d_vector(5, a, 3, b, 35, c, 21, d), + lerp2d_vector(3, a, 5, b, 21, c, 35, d), + lerp2d_vector_49_7_7_1(d, b, c, a), + })); + } + for (; src_x + 1 < src_width; ++src_x) { + size_t dst_x = src_x * 4 + 2; + const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1], + c = src_row1[src_x], d = src_row1[src_x + 1]; + + dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d); + dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d); + dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d); + dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d); + dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d); + dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d); + dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d); + dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d); + dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d); + dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d); + dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d); + dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d); + dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d); + dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d); + dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d); + dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d); + } + }; + + // Top rows: the second row duplicates the first; copy only the dst_width valid pixels so stride padding is never touched + process_edge_row(src, dst); + memcpy(dst + dst_stride, dst, dst_width); + + // Middle rows + for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + size_t dst_y = src_y * 4 + 2; + const uint8_t *src_row0 = src + src_stride * src_y; + const uint8_t *src_row1 = src_row0 + src_stride; + uint8_t
*dst_row0 = dst + dst_stride * dst_y; + uint8_t *dst_row1 = dst_row0 + dst_stride; + uint8_t *dst_row2 = dst_row1 + dst_stride; + uint8_t *dst_row3 = dst_row2 + dst_stride; + + process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); + } + + // Bottom rows: the last row duplicates the one above it; copying dst_width (not dst_stride) bytes keeps the write inside the final row + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + memcpy(dst + dst_stride * (dst_height - 1), + dst + dst_stride * (dst_height - 2), dst_width); + + return INTRINSICCV_OK; +} + INTRINSICCV_TARGET_FN_ATTRS intrinsiccv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, @@ -173,10 +417,17 @@ intrinsiccv_error_t resize_linear_u8(const uint8_t *src, size_t src_stride, CHECK_POINTER_AND_STRIDE(dst, dst_stride); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (src_width == 0 || src_height == 0) { + return INTRINSICCV_OK; + } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { return resize_2x2_u8(src, src_stride, src_width, src_height, dst, dst_stride); } + if (src_width * 4 == dst_width && src_height * 4 == dst_height) { + return resize_4x4_u8(src, src_stride, src_width, src_height, dst, + dst_stride); + } return INTRINSICCV_ERROR_NOT_IMPLEMENTED; } diff --git a/intrinsiccv/src/resize/resize_linear_sc.h b/intrinsiccv/src/resize/resize_linear_sc.h index c2d2ead6614be3b2fab3e74d75c229624513f489..f83a776a1cc9617f9a93a096fb30316ad4ba411f 100644 --- a/intrinsiccv/src/resize/resize_linear_sc.h +++ b/intrinsiccv/src/resize/resize_linear_sc.h @@ -13,10 +13,6 @@ namespace INTRINSICCV_TARGET_NAMESPACE { INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_2x2_u8_sc( const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, uint8_t *dst, size_t dst_stride) INTRINSICCV_STREAMING_COMPATIBLE { - if (src_width == 0 || src_height == 0) { - return INTRINSICCV_OK; - } - size_t dst_width = src_width * 2; size_t dst_height = src_height * 2; @@ -163,7 +159,190 @@
INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_2x2_u8_sc( // Bottom row process_edge_row(src + src_stride * (src_height - 1), - dst + dst_stride * (src_height * 2 - 1)); + dst + dst_stride * (dst_height - 1)); + + return INTRINSICCV_OK; +} + +INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_4x4_u8_sc( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride) INTRINSICCV_STREAMING_COMPATIBLE { + size_t dst_width = src_width * 4; + size_t dst_height = src_height * 4; + + auto lerp1d_scalar = + [](uint8_t p, uint8_t a, uint8_t q, uint8_t b) + INTRINSICCV_STREAMING_COMPATIBLE { return (p * a + q * b + 4) >> 3; }; + + auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b) ISC { + // bias (rounding term for the final shift by 3) + svuint16_t top = svdup_u16(4); + + // bias + a * p, accumulated separately for the bottom and top 16-bit halves + svuint16_t bot = svmlalb(top, a, p); + top = svmlalt(top, a, p); + + // bias + a * p + b * q + bot = svmlalb(bot, b, q); + top = svmlalt(top, b, q); + + // (bias + a * p + b * q) / 8, narrowed back to 8 bits + svuint8_t result = svshrnb(bot, 3ULL); + result = svshrnt(result, top, 3ULL); + return result; + }; + + auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b, + uint8_t r, svuint8_t c, uint8_t s, svuint8_t d) ISC { + // bias (rounding term for the final shift by 6) + svuint16_t top = svdup_u16(32); + + // bias + a * p, accumulated separately for the bottom and top 16-bit halves + svuint16_t bot = svmlalb(top, a, p); + top = svmlalt(top, a, p); + + // bias + a * p + b * q + bot = svmlalb(bot, b, q); + top = svmlalt(top, b, q); + + // bias + a * p + b * q + c * r + bot = svmlalb(bot, c, r); + top = svmlalt(top, c, r); + + // bias + a * p + b * q + c * r + d * s + bot = svmlalb(bot, d, s); + top = svmlalt(top, d, s); + + // (bias + a * p + b * q + c * r + d * s) / 64, narrowed back to 8 bits + svuint8_t result = svshrnt(svshrnb(bot, 6ULL), top, 6ULL); + return result; + }; + + // Handle top or bottom edge: interpolate horizontally only; the corner pixels are written separately below + auto process_edge_row = [src_width, lerp1d_vector](const uint8_t *src_row, + uint8_t *dst_row) ISC { + for (size_t src_x = 0; src_x + 1 < src_width; src_x +=
svcntb()) { + size_t dst_x = src_x * 4 + 2; + svbool_t pg = svwhilelt_b8(src_x + 1, src_width); + svuint8_t a = svld1_u8(pg, src_row + src_x); + svuint8_t b = svld1_u8(pg, src_row + src_x + 1); + svst4_u8(pg, dst_row + dst_x, + svcreate4(lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b), + lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b))); + } + }; + + auto process_row = [src_width, lerp2d_vector]( + const uint8_t *src_row0, const uint8_t *src_row1, + uint8_t *dst_row0, uint8_t *dst_row1, + uint8_t *dst_row2, + uint8_t *dst_row3) INTRINSICCV_STREAMING_COMPATIBLE { + // Middle elements + for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) { + size_t dst_x = src_x * 4 + 2; + + svbool_t pg = svwhilelt_b8(src_x + 1, src_width); + + svuint8_t a = svld1_u8(pg, src_row0 + src_x); + svuint8_t b = svld1_u8(pg, src_row0 + src_x + 1); + svuint8_t c = svld1_u8(pg, src_row1 + src_x); + svuint8_t d = svld1_u8(pg, src_row1 + src_x + 1); + + svst4_u8(pg, dst_row0 + dst_x, + (svcreate4(lerp2d_vector(49, a, 7, b, 7, c, 1, d), + lerp2d_vector(35, a, 21, b, 5, c, 3, d), + lerp2d_vector(21, a, 35, b, 3, c, 5, d), + lerp2d_vector(49, b, 7, a, 7, d, 1, c)))); + + svst4_u8(pg, dst_row1 + dst_x, + (svcreate4(lerp2d_vector(35, a, 5, b, 21, c, 3, d), + lerp2d_vector(25, a, 15, b, 15, c, 9, d), + lerp2d_vector(15, a, 25, b, 9, c, 15, d), + lerp2d_vector(5, a, 35, b, 3, c, 21, d)))); + svst4_u8(pg, dst_row2 + dst_x, + (svcreate4(lerp2d_vector(21, a, 3, b, 35, c, 5, d), + lerp2d_vector(15, a, 9, b, 25, c, 15, d), + lerp2d_vector(9, a, 15, b, 15, c, 25, d), + lerp2d_vector(3, a, 21, b, 5, c, 35, d)))); + svst4_u8(pg, dst_row3 + dst_x, + (svcreate4(lerp2d_vector(49, c, 7, a, 7, d, 1, b), + lerp2d_vector(5, a, 3, b, 35, c, 21, d), + lerp2d_vector(3, a, 5, b, 21, c, 35, d), + lerp2d_vector(49, d, 7, b, 7, c, 1, a)))); + } + }; + + // Corners: each 2x2 destination corner block replicates the nearest source corner pixel + auto set_corner = [dst, dst_stride](size_t left_column, size_t top_row, + uint8_t value) ISC { + dst[dst_stride * top_row + left_column]
= value; + dst[dst_stride * top_row + left_column + 1] = value; + dst[dst_stride * (top_row + 1) + left_column] = value; + dst[dst_stride * (top_row + 1) + left_column + 1] = value; + }; + set_corner(0, 0, src[0]); + set_corner(dst_width - 2, 0, src[src_width - 1]); + set_corner(0, dst_height - 2, src[src_stride * (src_height - 1)]); + set_corner(dst_width - 2, dst_height - 2, + src[src_stride * (src_height - 1) + src_width - 1]); + + // Left & right edge: the two outermost columns interpolate vertically only + for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + size_t dst_y = src_y * 4 + 2; + const uint8_t *src_row0 = src + src_stride * src_y; + const uint8_t *src_row1 = src_row0 + src_stride; + uint8_t *dst_row0 = dst + dst_stride * dst_y; + uint8_t *dst_row1 = dst_row0 + dst_stride; + uint8_t *dst_row2 = dst_row1 + dst_stride; + uint8_t *dst_row3 = dst_row2 + dst_stride; + + // Left elements + const uint8_t s0l = src_row0[0], s1l = src_row1[0]; + dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); + dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); + dst_row2[0] = dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); + dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); + + // Right elements + const uint8_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + const size_t dr0 = dst_width - 2; + const size_t dr1 = dst_width - 1; + dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); + dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); + dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); + dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); + } + + auto copy_dst_row = [src_width](const uint8_t *dst_from, + uint8_t *dst_to) ISC { + for (size_t i = 0; i < src_width; i += svcntb()) { + svbool_t pg = svwhilelt_b8(i, src_width); + svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4)); + } + }; + + // Top row: interpolate once, then duplicate it into the second destination row + process_edge_row(src, dst); + copy_dst_row(dst, dst + dst_stride); + + // Middle rows + for (size_t src_y = 0; src_y + 1 < src_height;
++src_y) { + size_t dst_y = src_y * 4 + 2; + const uint8_t *src_row0 = src + src_stride * src_y; + const uint8_t *src_row1 = src_row0 + src_stride; + uint8_t *dst_row0 = dst + dst_stride * dst_y; + uint8_t *dst_row1 = dst_row0 + dst_stride; + uint8_t *dst_row2 = dst_row1 + dst_stride; + uint8_t *dst_row3 = dst_row2 + dst_stride; + + process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); + } + + // Bottom row: interpolate once, then duplicate it into the last destination row + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + copy_dst_row(dst + dst_stride * (dst_height - 2), + dst + dst_stride * (dst_height - 1)); return INTRINSICCV_OK; } @@ -176,10 +355,17 @@ INTRINSICCV_TARGET_FN_ATTRS static intrinsiccv_error_t resize_linear_u8_sc( CHECK_POINTER_AND_STRIDE(dst, dst_stride); CHECK_IMAGE_SIZE(dst_width, dst_height); + if (src_width == 0 || src_height == 0) { + return INTRINSICCV_OK; + } if (src_width * 2 == dst_width && src_height * 2 == dst_height) { return resize_2x2_u8_sc(src, src_stride, src_width, src_height, dst, dst_stride); } + if (src_width * 4 == dst_width && src_height * 4 == dst_height) { + return resize_4x4_u8_sc(src, src_stride, src_width, src_height, dst, + dst_stride); + } return INTRINSICCV_ERROR_NOT_IMPLEMENTED; } diff --git a/test/api/test_resize_linear.cpp b/test/api/test_resize_linear.cpp index 9446577d7230cba702b84d0e6d8731b207e0935c..a011a259b307f03f779301ecebe3da69024d6aab 100644 --- a/test/api/test_resize_linear.cpp +++ b/test/api/test_resize_linear.cpp @@ -55,10 +55,9 @@ TEST(ResizeLinear, ZeroImageSize) { intrinsiccv_resize_linear_u8(src, 0, 0, 1, dst, 0, 0, 2)); } -static void resize_linear_unaccelerated(const uint8_t *src, size_t src_stride, - size_t src_width, size_t src_height, - uint8_t *dst, size_t dst_stride, - size_t dst_width, size_t dst_height) { +static void resize_linear_unaccelerated_2x2_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t
dst_height) { auto lerp1d = [](uint8_t near, uint8_t far) { return (near * 3 + far + 2) >> 2; }; @@ -111,23 +110,140 @@ static void resize_linear_unaccelerated(const uint8_t *src, size_t src_stride, process_row(last_src_row, last_src_row, last_dst_row, last_dst_row); } -TEST(ResizeLinear, LargeDimensions) { +static void resize_linear_unaccelerated_4x4_u8( + const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, + uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height) { + auto lerp1d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, + uint8_t b) { + return (coeff_a * a + coeff_b * b + 4) >> 3; + }; + auto lerp2d_scalar = [](uint8_t coeff_a, uint8_t a, uint8_t coeff_b, + uint8_t b, uint8_t coeff_c, uint8_t c, + uint8_t coeff_d, uint8_t d) { + return (coeff_a * a + coeff_b * b + coeff_c * c + coeff_d * d + 32) >> 6; + }; + // Handle top or bottom edge: interpolate horizontally only; the two outermost pixels on each side replicate the edge source pixel + auto process_edge_row = [src_width, dst_width, + lerp1d_scalar]( + const uint8_t *src_row, uint8_t *dst_row) { + // Left elements + dst_row[1] = dst_row[0] = src_row[0]; + + // Right elements + dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1]; + + // Middle elements + for (size_t src_x = 0; src_x + 1 < src_width; ++src_x) { + size_t dst_x = src_x * 4 + 2; + const uint8_t a = src_row[src_x], b = src_row[src_x + 1]; + dst_row[dst_x + 0] = lerp1d_scalar(7, a, 1, b); + dst_row[dst_x + 1] = lerp1d_scalar(5, a, 3, b); + dst_row[dst_x + 2] = lerp1d_scalar(3, a, 5, b); + dst_row[dst_x + 3] = lerp1d_scalar(1, a, 7, b); + } + }; + + auto process_row = [src_width, dst_width, lerp1d_scalar, lerp2d_scalar]( + const uint8_t *src_row0, const uint8_t *src_row1, + uint8_t *dst_row0, uint8_t *dst_row1, + uint8_t *dst_row2, uint8_t *dst_row3) { + // Left elements + const uint8_t s0l = src_row0[0], s1l = src_row1[0]; + dst_row0[0] = dst_row0[1] = lerp1d_scalar(7, s0l, 1, s1l); + dst_row1[0] = dst_row1[1] = lerp1d_scalar(5, s0l, 3, s1l); + dst_row2[0] =
dst_row2[1] = lerp1d_scalar(3, s0l, 5, s1l); + dst_row3[0] = dst_row3[1] = lerp1d_scalar(1, s0l, 7, s1l); + + // Right elements + const uint8_t s0r = src_row0[src_width - 1], s1r = src_row1[src_width - 1]; + const size_t dr0 = dst_width - 2; + const size_t dr1 = dst_width - 1; + dst_row0[dr0] = dst_row0[dr1] = lerp1d_scalar(7, s0r, 1, s1r); + dst_row1[dr0] = dst_row1[dr1] = lerp1d_scalar(5, s0r, 3, s1r); + dst_row2[dr0] = dst_row2[dr1] = lerp1d_scalar(3, s0r, 5, s1r); + dst_row3[dr0] = dst_row3[dr1] = lerp1d_scalar(1, s0r, 7, s1r); + + // Middle elements + for (size_t src_x = 0; src_x + 1 < src_width; ++src_x) { + size_t dst_x = src_x * 4 + 2; + const uint8_t a = src_row0[src_x], b = src_row0[src_x + 1], + c = src_row1[src_x], d = src_row1[src_x + 1]; + + dst_row0[dst_x + 0] = lerp2d_scalar(49, a, 7, b, 7, c, 1, d); + dst_row0[dst_x + 1] = lerp2d_scalar(35, a, 21, b, 5, c, 3, d); + dst_row0[dst_x + 2] = lerp2d_scalar(21, a, 35, b, 3, c, 5, d); + dst_row0[dst_x + 3] = lerp2d_scalar(7, a, 49, b, 1, c, 7, d); + dst_row1[dst_x + 0] = lerp2d_scalar(35, a, 5, b, 21, c, 3, d); + dst_row1[dst_x + 1] = lerp2d_scalar(25, a, 15, b, 15, c, 9, d); + dst_row1[dst_x + 2] = lerp2d_scalar(15, a, 25, b, 9, c, 15, d); + dst_row1[dst_x + 3] = lerp2d_scalar(5, a, 35, b, 3, c, 21, d); + dst_row2[dst_x + 0] = lerp2d_scalar(21, a, 3, b, 35, c, 5, d); + dst_row2[dst_x + 1] = lerp2d_scalar(15, a, 9, b, 25, c, 15, d); + dst_row2[dst_x + 2] = lerp2d_scalar(9, a, 15, b, 15, c, 25, d); + dst_row2[dst_x + 3] = lerp2d_scalar(3, a, 21, b, 5, c, 35, d); + dst_row3[dst_x + 0] = lerp2d_scalar(7, a, 1, b, 49, c, 7, d); + dst_row3[dst_x + 1] = lerp2d_scalar(5, a, 3, b, 35, c, 21, d); + dst_row3[dst_x + 2] = lerp2d_scalar(3, a, 5, b, 21, c, 35, d); + dst_row3[dst_x + 3] = lerp2d_scalar(1, a, 7, b, 7, c, 49, d); + } + }; + + // Top rows: the second row duplicates the first; copy only the dst_width valid pixels so stride padding is never touched + process_edge_row(src, dst); + memcpy(dst + dst_stride, dst, dst_width); + + // Middle rows + for (size_t src_y = 0; src_y + 1 < src_height; ++src_y) { + size_t dst_y =
src_y * 4 + 2; + const uint8_t *src_row0 = src + src_stride * src_y; + const uint8_t *src_row1 = src_row0 + src_stride; + uint8_t *dst_row0 = dst + dst_stride * dst_y; + uint8_t *dst_row1 = dst_row0 + dst_stride; + uint8_t *dst_row2 = dst_row1 + dst_stride; + uint8_t *dst_row3 = dst_row2 + dst_stride; + + process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3); + } + + // Bottom rows: the last row duplicates the one above it; copying dst_width (not dst_stride) bytes keeps the write inside the final row + process_edge_row(src + src_stride * (src_height - 1), + dst + dst_stride * (dst_height - 2)); + memcpy(dst + dst_stride * (dst_height - 1), + dst + dst_stride * (dst_height - 2), dst_width); +} + +static void resize_linear_unaccelerated_u8(const uint8_t *src, + size_t src_stride, size_t src_width, + size_t src_height, uint8_t *dst, + size_t dst_stride, size_t dst_width, + size_t dst_height) { + if (src_width * 2 == dst_width && src_height * 2 == dst_height) { + resize_linear_unaccelerated_2x2_u8(src, src_stride, src_width, src_height, + dst, dst_stride, dst_width, dst_height); + } + if (src_width * 4 == dst_width && src_height * 4 == dst_height) { + resize_linear_unaccelerated_4x4_u8(src, src_stride, src_width, src_height, + dst, dst_stride, dst_width, dst_height); + } +} + +static void do_large_dimensions_test(size_t x_scale, size_t y_scale) { size_t src_width = 2049; size_t src_height = 5; size_t src_stride = src_width + 6; - size_t dst_width = src_width * 2; - size_t dst_height = src_height * 2; + size_t dst_width = src_width * x_scale; + size_t dst_height = src_height * y_scale; size_t dst_stride = dst_width + 3; std::vector src, dst, expected_data; src.resize(src_stride * src_height); dst.resize(dst_stride * dst_height); expected_data.resize(dst_stride * dst_height); - std::mt19937 generator{test::Options::seed()}; + std::mt19937 generator{static_cast(test::Options::seed())}; std::generate(src.begin(), src.end(), generator); - resize_linear_unaccelerated(src.data(), src_stride, src_width, src_height, - expected_data.data(), dst_stride, dst_width, - dst_height);
+ resize_linear_unaccelerated_u8(src.data(), src_stride, src_width, src_height, + expected_data.data(), dst_stride, dst_width, + dst_height); ASSERT_EQ(INTRINSICCV_OK, intrinsiccv_resize_linear_u8( src.data(), src_stride, src_width, src_height, dst.data(), dst_stride, dst_width, dst_height)); @@ -144,6 +260,10 @@ TEST(ResizeLinear, LargeDimensions) { } } +TEST(ResizeLinear, LargeDimensions2x2) { do_large_dimensions_test(2, 2); } + +TEST(ResizeLinear, LargeDimensions4x4) { do_large_dimensions_test(4, 4); } + // Parameterised tests struct ResizeTestParams { std::vector> src; @@ -284,4 +404,109 @@ INSTANTIATE_TEST_SUITE_P( 81, 59, 90, 127, 170, 170, 127, 98, 81, 92, 129, 114, 47, 50, 124, 150, 129, 94, 45, 17, 8, 11, 26, 36, 39, 68, 123, 143, 128, 107, 81, 70, 73, 60, 29, 18, - 27, 54, 101, 148, 197, 219, 216, 197, 163, 146}}})); + 27, 54, 101, 148, 197, 219, 216, 197, 163, 146}}}, + // 2*2 -> 8*8 + P{{{0, 255}, {128, 124}}, + {{0, 0, 32, 96, 159, 223, 255, 255}, + {0, 0, 32, 96, 159, 223, 255, 255}, + {16, 16, 44, 99, 155, 211, 239, 239}, + {48, 48, 68, 107, 147, 186, 206, 206}, + {80, 80, 92, 115, 138, 161, 173, 173}, + {112, 112, 116, 123, 130, 137, 140, 140}, + {128, 128, 128, 127, 126, 125, 124, 124}, + {128, 128, 128, 127, 126, 125, 124, 124}}}, + // 35*2 -> 140*8 + P{{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 82, + 155, 104, 108, 227, 46, 162, 21, 220, 235, 183, 113, 225, + 146, 196, 144, 104, 148, 19, 126, 172, 9, 12, 61}, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 193, + 44, 105, 191, 106, 73, 148, 13, 161, 118, 21, 3, 34, + 40, 150, 120, 68, 75, 14, 31, 124, 221, 214, 146}}, + {{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, + 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, + 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, + 9, 10, 10, 19, 37, 55, 73, 91, 109, 128, 146, 149, 136, + 123, 110, 105, 106, 107, 108, 123, 153, 182, 212, 204, 159, 114, + 69, 61, 90, 119, 148, 144, 109, 74, 39, 46, 96, 145, 195, + 222, 226, 229, 233, 229, 216, 203, 190, 174, 157, 139, 122, 127, + 155, 183, 211, 215, 
195, 176, 156, 152, 165, 177, 190, 190, 177, + 164, 151, 139, 129, 119, 109, 110, 121, 132, 143, 132, 100, 67, + 35, 32, 59, 86, 113, 132, 143, 155, 166, 152, 111, 70, 29, + 9, 10, 11, 12, 18, 30, 43, 55, 61, 61}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, + 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, + 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, + 9, 10, 10, 19, 37, 55, 73, 91, 109, 128, 146, 149, 136, + 123, 110, 105, 106, 107, 108, 123, 153, 182, 212, 204, 159, 114, + 69, 61, 90, 119, 148, 144, 109, 74, 39, 46, 96, 145, 195, + 222, 226, 229, 233, 229, 216, 203, 190, 174, 157, 139, 122, 127, + 155, 183, 211, 215, 195, 176, 156, 152, 165, 177, 190, 190, 177, + 164, 151, 139, 129, 119, 109, 110, 121, 132, 143, 132, 100, 67, + 35, 32, 59, 86, 113, 132, 143, 155, 166, 152, 111, 70, 29, + 9, 10, 11, 12, 18, 30, 43, 55, 61, 61}, + {1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, + 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, + 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, + 10, 10, 10, 21, 43, 64, 85, 102, 113, 124, 135, 137, 127, + 118, 109, 106, 109, 113, 117, 130, 153, 177, 200, 192, 151, 110, + 70, 63, 91, 119, 146, 143, 108, 73, 38, 44, 92, 140, 189, + 214, 216, 217, 219, 213, 199, 184, 170, 155, 139, 123, 107, 112, + 137, 163, 188, 193, 175, 158, 141, 140, 154, 169, 183, 184, 172, + 159, 147, 136, 125, 115, 105, 104, 114, 124, 134, 124, 94, 64, + 33, 30, 54, 78, 102, 121, 134, 147, 160, 150, 117, 84, 52, + 36, 36, 37, 37, 42, 50, 59, 67, 72, 72}, + {2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, + 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, + 11, 11, 11, 26, 54, 82, 110, 122, 120, 117, 115, 112, 110, + 108, 106, 109, 117, 126, 135, 144, 155, 166, 176, 166, 135, 103, + 72, 69, 94, 119, 144, 139, 105, 70, 35, 40, 85, 130, 175, + 197, 195, 194, 192, 183, 165, 148, 131, 116, 103, 91, 78, 82, + 102, 123, 143, 147, 136, 124, 112, 115, 133, 152, 170, 173, 162, + 151, 140, 129, 118, 107, 96, 94, 102, 109, 117, 108, 82, 56, + 30, 26, 45, 63, 81, 98, 114, 130, 146, 146, 
129, 113, 97, + 88, 88, 88, 88, 88, 90, 91, 92, 93, 93}, + {3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, + 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, + 12, 12, 12, 30, 65, 99, 134, 143, 127, 110, 94, 88, 93, + 98, 102, 112, 125, 139, 153, 159, 157, 155, 152, 140, 118, 96, + 74, 74, 97, 119, 142, 136, 102, 67, 33, 37, 79, 120, 162, + 180, 175, 170, 165, 152, 132, 112, 92, 77, 68, 58, 49, 52, + 67, 83, 98, 102, 96, 89, 83, 91, 113, 134, 156, 162, 153, + 143, 134, 123, 111, 99, 87, 84, 89, 95, 100, 92, 70, 48, + 27, 22, 35, 48, 60, 76, 95, 114, 133, 142, 142, 142, 142, + 141, 140, 139, 139, 135, 129, 123, 117, 114, 114}, + {4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, + 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, + 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, + 13, 13, 13, 34, 76, 117, 158, 164, 134, 103, 73, 64, 76, + 87, 99, 114, 133, 152, 171, 173, 158, 143, 129, 115, 102, 89, + 76, 80, 100, 120, 140, 133, 99, 65, 31, 33, 72, 110, 149, + 164, 155, 146, 137, 121, 98, 76, 53, 38, 32, 26, 20, 22, + 32, 42, 53, 57, 56, 55, 54, 66, 92, 117, 143, 152, 143, + 135, 127, 117, 104, 91, 79, 74, 77, 80, 83, 75, 58, 41, + 23, 18, 25, 32, 39, 54, 76, 97, 119, 138, 154, 170, 186, + 194, 192, 191, 189, 182, 169, 155, 142, 135, 135}, + {4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, + 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, + 13, 14, 14, 36, 81, 126, 171, 174, 137, 100, 63, 52, 67, + 82, 97, 116, 137, 159, 180, 180, 159, 138, 117, 102, 94, 85, + 77, 82, 101, 120, 139, 131, 97, 64, 30, 32, 69, 106, 143, + 156, 145, 134, 123, 106, 82, 57, 33, 19, 14, 10, 5, 7, + 15, 22, 30, 35, 36, 38, 39, 54, 81, 109, 136, 146, 139, + 131, 124, 114, 101, 88, 75, 69, 71, 72, 74, 67, 52, 37, + 22, 16, 20, 25, 29, 43, 66, 89, 112, 136, 160, 185, 209, + 220, 218, 217, 215, 206, 189, 172, 155, 146, 146}, + {4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, + 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 
12, 12, 13, 13, 13, + 13, 14, 14, 36, 81, 126, 171, 174, 137, 100, 63, 52, 67, + 82, 97, 116, 137, 159, 180, 180, 159, 138, 117, 102, 94, 85, + 77, 82, 101, 120, 139, 131, 97, 64, 30, 32, 69, 106, 143, + 156, 145, 134, 123, 106, 82, 57, 33, 19, 14, 10, 5, 7, + 15, 22, 30, 35, 36, 38, 39, 54, 81, 109, 136, 146, 139, + 131, 124, 114, 101, 88, 75, 69, 71, 72, 74, 67, 52, 37, + 22, 16, 20, 25, 29, 43, 66, 89, 112, 136, 160, 185, 209, + 220, 218, 217, 215, 206, 189, 172, 155, 146, 146}}}));