diff --git a/adapters/opencv/kleidicv_hal.cpp b/adapters/opencv/kleidicv_hal.cpp index e73c350fa55a5e466eae00387a9d71be9b0e671c..6c9dbdae7a75a885e6da4f7e022dfde3e751a3fd 100644 --- a/adapters/opencv/kleidicv_hal.cpp +++ b/adapters/opencv/kleidicv_hal.cpp @@ -1363,6 +1363,15 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, static_cast(dst_width), static_cast(dst_height), 1, mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_8UC4) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_s16point5_u8( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 4, + mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, + border_value.data(), mt)); } else if (src_type == CV_16UC1) { auto border_value = get_border_value(border_value_f64); return convert_error(kleidicv_thread_remap_s16point5_u16( @@ -1372,6 +1381,15 @@ int remap_s16point5(int src_type, const uchar *src_data, size_t src_step, static_cast(dst_width), static_cast(dst_height), 1, mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, border_value.data(), mt)); + } else if (src_type == CV_16UC4) { + auto border_value = get_border_value(border_value_f64); + return convert_error(kleidicv_thread_remap_s16point5_u16( + reinterpret_cast(src_data), src_step, + static_cast(src_width), static_cast(src_height), + reinterpret_cast(dst_data), dst_step, + static_cast(dst_width), static_cast(dst_height), 4, + mapxy, mapxy_step, mapfrac, mapfrac_step, kleidicv_border_type, + border_value.data(), mt)); } return CV_HAL_ERROR_NOT_IMPLEMENTED; diff --git a/conformity/opencv/test_remap.cpp b/conformity/opencv/test_remap.cpp index face699ec1f7f0080db7d6bda130a782bbad2808..c7bc4f57dfbb2a34ca9eac132fd4a67154057b2b 100644 --- a/conformity/opencv/test_remap.cpp +++ b/conformity/opencv/test_remap.cpp @@ -250,9 +250,13 @@ std::vector& remap_tests_get() { TEST("RemapS16 uint16 Constant", (test_remap_s16), (exec_remap_s16)), TEST("RemapS16Point5 uint8 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Replicate", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Replicate 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint8 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint8 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapS16Point5 uint16 Constant", (test_remap_s16point5), (exec_remap_s16point5)), + TEST("RemapS16Point5 uint16 Constant 4ch", (test_remap_s16point5), (exec_remap_s16point5)), TEST("RemapF32 uint8 Replicate Linear", (test_remap_f32), (exec_remap_f32)), TEST("RemapF32 uint16 Replicate Linear", (test_remap_f32), (exec_remap_f32)), diff --git a/kleidicv/include/kleidicv/transform/remap.h b/kleidicv/include/kleidicv/transform/remap.h index dcf43bd6401eb902417ae003810a5aefbc19e506..285fc22f7bd19c6e77a249076f10a6413ea913b4 100644 --- a/kleidicv/include/kleidicv/transform/remap.h +++ b/kleidicv/include/kleidicv/transform/remap.h @@ -38,13 +38,14 @@ inline bool remap_s16point5_is_implemented( size_t channels) KLEIDICV_STREAMING_COMPATIBLE { if constexpr (std::is_same::value || std::is_same::value) { - return (src_stride / 
sizeof(T) <= std::numeric_limits::max() && + return (src_stride / sizeof(T) <= + (std::numeric_limits::max() / channels) && dst_width >= 8 && src_width <= std::numeric_limits::max() + 1 && src_height <= std::numeric_limits::max() + 1 && (border_type == KLEIDICV_BORDER_TYPE_REPLICATE || border_type == KLEIDICV_BORDER_TYPE_CONSTANT) && - channels == 1); + (channels == 1 || channels == 4)); } else { return false; } diff --git a/kleidicv/src/transform/remap_s16point5_neon.cpp b/kleidicv/src/transform/remap_s16point5_neon.cpp index 27893a0a34f4f6fb882a77192314b1a8718055dd..568e13f9b3ee5fee3bf97e93be68dcc663f717f4 100644 --- a/kleidicv/src/transform/remap_s16point5_neon.cpp +++ b/kleidicv/src/transform/remap_s16point5_neon.cpp @@ -310,6 +310,43 @@ class RemapS16Point5Replicate { int16x8_t y1_; }; // end of class RemapS16Point5Replicate +inline uint8x8_t interpolate_u8(uint8x8_t v00_narrow, uint8x8_t v01_narrow, + uint8x8_t v10_narrow, uint8x8_t v11_narrow, + uint16x8_t xfrac, uint16x8_t yfrac, + uint16x8_t nxfrac, uint16x8_t nyfrac) { + uint16x8_t v00 = vmovl_u8(v00_narrow); + uint16x8_t v01 = vmovl_u8(v01_narrow); + uint16x8_t v10 = vmovl_u8(v10_narrow); + uint16x8_t v11 = vmovl_u8(v11_narrow); + + auto interpolate_horizontal = [&](uint16x8_t left, uint16x8_t right) { + return vmlaq_u16(vmulq_u16(nxfrac, left), xfrac, right); + }; + + // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint16x4_t top, uint16x4_t bottom, + uint16x4_t frac, uint16x4_t nfrac) { + uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, top, nfrac), bottom, frac); + return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint16x8_t line0 = interpolate_horizontal(v00, v10); + uint16x8_t line1 = interpolate_horizontal(v01, v11); + + uint16x4_t lo = + interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), + vget_low_u16(yfrac), vget_low_u16(nyfrac)); + uint16x4_t hi = + interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), + vget_high_u16(yfrac), vget_high_u16(nyfrac)); + + // Discard upper 8 bits of each element and combine low and high parts into + // a single register. 
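+  // (On little-endian AArch64 the low byte of each 16-bit lane sits at the
+  // even byte index, which is exactly what vuzp1_u8 selects.)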
+ return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); +} + template class RemapS16Point5ConstantBorder; @@ -333,13 +370,13 @@ class RemapS16Point5ConstantBorder { auto vector_path = [&](size_t step) { int16x8x2_t xy = vld2q_s16(&mapxy[0]); uint16x8_t frac = vld1q_u16(&mapfrac[0]); - uint8x8_t frac_max = vdup_n_u8(REMAP16POINT5_FRAC_MAX); - uint8x8_t frac_mask = vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1); - uint8x8_t xfrac = vand_u8(vmovn_u16(frac), frac_mask); - uint8x8_t yfrac = - vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); - uint8x8_t nxfrac = vsub_u8(frac_max, xfrac); - uint8x8_t nyfrac = vsub_u8(frac_max, yfrac); + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + uint16x8_t frac_mask = vdupq_n_u16(REMAP16POINT5_FRAC_MAX - 1); + uint16x8_t xfrac = vandq_u16(frac, frac_mask); + uint16x8_t yfrac = + vandq_u16(vshrq_n_u16(frac, REMAP16POINT5_FRAC_BITS), frac_mask); + uint16x8_t nxfrac = vsubq_u16(frac_max, xfrac); + uint16x8_t nyfrac = vsubq_u16(frac_max, yfrac); uint16x8_t one = vdupq_n_u16(1); uint16x8_t x0 = vreinterpretq_u16_s16(xy.val[0]); @@ -356,8 +393,8 @@ class RemapS16Point5ConstantBorder { uint8x8_t v11 = load_pixels_or_constant_border( src_rows_, v_src_stride_, v_width_, v_height_, v_border_, x1, y1); - uint8x8_t result = interpolate(v00, v01, v10, v11, xfrac, vmovl_u8(yfrac), - nxfrac, vmovl_u8(nyfrac)); + uint8x8_t result = + interpolate_u8(v00, v01, v10, v11, xfrac, yfrac, nxfrac, nyfrac); vst1_u8(&dst[0], result); mapxy += ptrdiff_t(step); @@ -414,37 +451,6 @@ class RemapS16Point5ConstantBorder { return vbsl_u8(vmovn_u16(in_range), pixels, v_border_); } - uint8x8_t interpolate(uint8x8_t v00, uint8x8_t v01, uint8x8_t v10, - uint8x8_t v11, uint8x8_t xfrac, uint16x8_t yfrac, - uint8x8_t nxfrac, uint16x8_t nyfrac) { - auto interpolate_horizontal = [&](uint8x8_t left, uint8x8_t right) { - return vmlal_u8(vmull_u8(nxfrac, left), xfrac, right); - }; - - // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. - const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); - - auto interpolate_vertical = [&](uint16x4_t a, uint16x4_t b, uint16x4_t frac, - uint16x4_t nfrac) { - uint32x4_t res32 = vmlal_u16(vmlal_u16(bias, a, nfrac), b, frac); - return vshrn_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); - }; - - uint16x8_t line0 = interpolate_horizontal(v00, v10); - uint16x8_t line1 = interpolate_horizontal(v01, v11); - - uint16x4_t lo = - interpolate_vertical(vget_low_u16(line0), vget_low_u16(line1), - vget_low_u16(yfrac), vget_low_u16(nyfrac)); - uint16x4_t hi = - interpolate_vertical(vget_high_u16(line0), vget_high_u16(line1), - vget_high_u16(yfrac), vget_high_u16(nyfrac)); - - // Discard upper 8 bits of each element and combine low and high parts into - // a single register. 
- return vuzp1_u8(vreinterpret_u8_u16(lo), vreinterpret_u8_u16(hi)); - } - Rows src_rows_; uint16x8_t v_src_stride_; uint16x8_t v_width_; @@ -625,6 +631,664 @@ class RemapS16Point5ConstantBorder { int16x8_t y1_; }; // end of class RemapS16Point5ConstantBorder +inline void get_coordinates(Columns mapxy, + Columns mapfrac, uint16x8_t &x, + uint16x8_t &y, uint8x8_t &xfrac, uint8x8_t &yfrac) { + int16x8x2_t xy = vld2q_s16(&mapxy[0]); + x = xy.val[0]; + y = xy.val[1]; + + uint16x8_t frac = vld1q_u16(&mapfrac[0]); + xfrac = vand_u8(vmovn_u16(frac), vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1)); + yfrac = vand_u8(vshrn_n_u16(frac, REMAP16POINT5_FRAC_BITS), + vdup_n_u8(REMAP16POINT5_FRAC_MAX - 1)); +} + +inline void get_offsets_4ch(uint16x4_t x0, uint16x4_t y0, uint16x4_t x1, + uint16x4_t y1, uint32x4_t &offsets_a, + uint32x4_t &offsets_b, uint32x4_t &offsets_c, + uint32x4_t &offsets_d, + uint16x4_t v_src_element_stride) { + // Multiply by 4 because of channels + uint32x4_t x0_scaled = vshll_n_u16(x0, 2); + uint32x4_t x1_scaled = vshll_n_u16(x1, 2); + + // Calculate offsets from coordinates (y * element_stride + x) + // a: top left, b: top right, c: bottom left, d: bottom right + offsets_a = vmlal_u16(x0_scaled, y0, v_src_element_stride); + offsets_b = vmlal_u16(x1_scaled, y0, v_src_element_stride); + offsets_c = vmlal_u16(x0_scaled, y1, v_src_element_stride); + offsets_d = vmlal_u16(x1_scaled, y1, v_src_element_stride); +} + +inline uint16x8_t create_frac_low_high_u8_4ch(uint8_t frac_low, + uint8_t frac_high) { + uint8x8_t frac_low_high = {frac_low, frac_low, frac_low, frac_low, + frac_high, frac_high, frac_high, frac_high}; + return vmovl_u8(frac_low_high); +} + +inline uint8x16_t interpolate_u8_4ch( + uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d, + uint8_t xfrac_pixel_0, uint8_t yfrac_pixel_0, uint8_t xfrac_pixel_1, + uint8_t yfrac_pixel_1, uint8_t xfrac_pixel_2, uint8_t yfrac_pixel_2, + uint8_t xfrac_pixel_3, uint8_t yfrac_pixel_3) { + uint16x8_t frac_max = vdupq_n_u16(REMAP16POINT5_FRAC_MAX); + + uint16x8_t xfrac_pixels_01 = + create_frac_low_high_u8_4ch(xfrac_pixel_0, xfrac_pixel_1); + uint16x8_t yfrac_pixels_01 = + create_frac_low_high_u8_4ch(yfrac_pixel_0, yfrac_pixel_1); + uint16x8_t nxfrac_pixels_01 = vsubq_u16(frac_max, xfrac_pixels_01); + uint16x8_t nyfrac_pixels_01 = vsubq_u16(frac_max, yfrac_pixels_01); + + uint16x8_t xfrac_pixels_23 = + create_frac_low_high_u8_4ch(xfrac_pixel_2, xfrac_pixel_3); + uint16x8_t yfrac_pixels_23 = + create_frac_low_high_u8_4ch(yfrac_pixel_2, yfrac_pixel_3); + uint16x8_t nxfrac_pixels_23 = vsubq_u16(frac_max, xfrac_pixels_23); + uint16x8_t nyfrac_pixels_23 = vsubq_u16(frac_max, yfrac_pixels_23); + + uint8x8x2_t res; + res.val[0] = interpolate_u8(vget_low_u8(a), vget_low_u8(c), vget_low_u8(b), + vget_low_u8(d), xfrac_pixels_01, yfrac_pixels_01, + nxfrac_pixels_01, nyfrac_pixels_01); + res.val[1] = interpolate_u8(vget_high_u8(a), vget_high_u8(c), vget_high_u8(b), + vget_high_u8(d), xfrac_pixels_23, yfrac_pixels_23, + nxfrac_pixels_23, nyfrac_pixels_23); + + return vcombine(res.val[0], res.val[1]); +} + +inline uint16x8_t interpolate_u16_4ch(uint16x8_t v00, uint16x8_t v01, + uint16x8_t v10, uint16x8_t v11, + uint16_t xfrac_pixel_0, + uint16_t yfrac_pixel_0, + uint16_t xfrac_pixel_1, + uint16_t yfrac_pixel_1) { + auto interpolate_horizontal = [&](uint16x4_t left, uint16x4_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return vmlal_u16(vmull_u16(nfrac, left), frac, right); + }; + + auto interpolate_horizontal_low = [&](uint16x8_t left, 
uint16x8_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_low_u16(left), vget_low_u16(right), frac, + nfrac); + }; + + auto interpolate_horizontal_high = [&](uint16x8_t left, uint16x8_t right, + uint16x4_t frac, + uint16x4_t nfrac) -> uint32x4_t { + return interpolate_horizontal(vget_high_u16(left), vget_high_u16(right), + frac, nfrac); + }; + + // Offset pixel values from [0,255] to [0.5,255.5] before rounding down. + const uint32x4_t bias = vdupq_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + auto interpolate_vertical = [&](uint32x4_t line0, uint32x4_t line1, + uint32x4_t frac, + uint32x4_t nfrac) -> uint32x4_t { + uint32x4_t res32 = vmlaq_u32(vmlaq_u32(bias, line0, nfrac), line1, frac); + return vshrq_n_u32(res32, 2 * REMAP16POINT5_FRAC_BITS); + }; + + uint16x4_t v_xfrac0 = vdup_n_u16(xfrac_pixel_0); + uint16x4_t v_nxfrac0 = vdup_n_u16(REMAP16POINT5_FRAC_MAX - xfrac_pixel_0); + uint16x4_t v_xfrac1 = vdup_n_u16(xfrac_pixel_1); + uint16x4_t v_nxfrac1 = vdup_n_u16(REMAP16POINT5_FRAC_MAX - xfrac_pixel_1); + + uint32x4_t line0_low = + interpolate_horizontal_low(v00, v10, v_xfrac0, v_nxfrac0); + uint32x4_t line1_low = + interpolate_horizontal_low(v01, v11, v_xfrac0, v_nxfrac0); + uint32x4_t line0_high = + interpolate_horizontal_high(v00, v10, v_xfrac1, v_nxfrac1); + uint32x4_t line1_high = + interpolate_horizontal_high(v01, v11, v_xfrac1, v_nxfrac1); + + uint32x4_t v_yfrac0 = vmovl_u16(vdup_n_u16(yfrac_pixel_0)); + uint32x4_t v_nyfrac0 = + vmovl_u16(vdup_n_u16(REMAP16POINT5_FRAC_MAX - yfrac_pixel_0)); + uint32x4_t v_yfrac1 = vmovl_u16(vdup_n_u16(yfrac_pixel_1)); + uint32x4_t v_nyfrac1 = + vmovl_u16(vdup_n_u16(REMAP16POINT5_FRAC_MAX - yfrac_pixel_1)); + + uint32x4_t lo = + interpolate_vertical(line0_low, line1_low, v_yfrac0, v_nyfrac0); + uint32x4_t hi = + interpolate_vertical(line0_high, line1_high, v_yfrac1, v_nyfrac1); + + // Discard upper 16 bits of each element (low the precision back to original + // 16 bits) + return vcombine(vmovn_u32(lo), vmovn_u32(hi)); +} + +inline uint64_t load_32bit(const uint8_t *src) { + uint32_t value = 0; + memcpy(&value, src, sizeof(uint32_t)); + return static_cast(value); +} + +inline uint8x16_t load_4px_4ch(Rows src_rows, + uint32x4_t offsets) { + uint64_t pixels01 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 0)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 1)]) << 32); + uint64_t pixels23 = load_32bit(&src_rows[vgetq_lane_u32(offsets, 2)]) | + (load_32bit(&src_rows[vgetq_lane_u32(offsets, 3)]) << 32); + return vcombine(vcreate_u8(pixels01), vcreate_u8(pixels23)); +} + +inline void store_pixels_u8_4ch(uint8x16x2_t res, Columns dst) { + vst1q_u8_x2(&dst[0], res); +} + +inline uint16x8_t load_2px_4ch(Rows src_rows, + uint32x2_t offsets) { + return vcombine(vld1_u16(&src_rows[vget_lane_u32(offsets, 0)]), + vld1_u16(&src_rows[vget_lane_u32(offsets, 1)])); +} + +inline void store_pixels_u16_4ch(uint16x8x4_t res, Columns dst) { + vst1q_u16_x4(&dst[0], res); +} + +// Replicate border functions +inline void get_coordinates_replicate(Columns mapxy, + Columns mapfrac, + uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, + uint8x8_t &xfrac, uint8x8_t &yfrac, + int16x8_t v_xmax, int16x8_t v_ymax) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + // Zero the xfrac (or yfrac) if x (or y) are below zero + xfrac = + vbsl_u8(vmovn_u16(vcltq_s16(x0, vdupq_n_s16(0))), vdup_n_u8(0), xfrac); + yfrac = + vbsl_u8(vmovn_u16(vcltq_s16(y0, vdupq_n_s16(0))), vdup_n_u8(0), yfrac); + + // Clamp 
coordinates to within the dimensions of the source image + x0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(x0, v_xmax))); + y0 = vreinterpretq_u16_s16(vmaxq_s16(vdupq_n_s16(0), vminq_s16(y0, v_ymax))); + + // x1 = x0 + 1, except if it's already xmax + x1 = vsubq_u16(x0, vcltq_s16(x0, v_xmax)); + y1 = vsubq_u16(y0, vcltq_s16(y0, v_ymax)); +} + +inline void load_pixels_u8_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint8x16_t &a, uint8x16_t &b, + uint8x16_t &c, uint8x16_t &d) { + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); +} + +inline void load_pixels_u16_4ch_replicate( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x8_t &a_lo, + uint16x8_t &a_hi, uint16x8_t &b_lo, uint16x8_t &b_hi, uint16x8_t &c_lo, + uint16x8_t &c_hi, uint16x8_t &d_lo, uint16x8_t &d_hi) { + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); +} + +// Constant border functions +inline void get_coordinates_constant( + Columns mapxy, Columns mapfrac, + uint16x8_t v_width, uint16x8_t v_height, uint16x8_t &x0, uint16x8_t &y0, + uint16x8_t &x1, uint16x8_t &y1, uint8x8_t &xfrac, uint8x8_t &yfrac, + uint16x8_t &in_range_a, uint16x8_t &in_range_b, uint16x8_t &in_range_c, + uint16x8_t &in_range_d) { + get_coordinates(mapxy, mapfrac, x0, y0, xfrac, yfrac); + + uint16x8_t one = vdupq_n_u16(1); + x1 = vaddq_u16(x0, one); + y1 = vaddq_u16(y0, one); + + uint16x8_t x0_in_range = vcltq_u16(x0, v_width); + uint16x8_t y0_in_range = vcltq_u16(y0, v_height); + uint16x8_t x1_in_range = vcltq_u16(x1, v_width); + uint16x8_t y1_in_range = vcltq_u16(y1, v_height); + + in_range_a = vandq(x0_in_range, y0_in_range); + in_range_b = vandq(x1_in_range, y0_in_range); + in_range_c = vandq(x0_in_range, y1_in_range); + in_range_d = vandq(x1_in_range, y1_in_range); +} + +inline uint32x4_t zero_out_of_range_offsets(uint16x4_t in_range, + uint32x4_t offsets) { + return vbslq_u32(vmovl_u16(in_range), offsets, vdupq_n_u32(0)); +} + +inline uint8x16_t replace_pixel_with_border_u8_4ch(uint16x4_t in_range, + uint8x16_t pixels, + uint8x16_t v_border) { + uint8x16_t in_range_stretched = + vcombine(vmovn_u16(vcombine(vdup_n_u16(vget_lane_u16(in_range, 0)), + vdup_n_u16(vget_lane_u16(in_range, 1)))), + vmovn_u16(vcombine(vdup_n_u16(vget_lane_u16(in_range, 2)), + vdup_n_u16(vget_lane_u16(in_range, 3))))); + + return vbslq_u8(in_range_stretched, pixels, v_border); +} + +inline uint16x8_t replace_pixel_with_border_u16_4ch(uint16_t in_range0, + uint16_t in_range1, + uint16x8_t pixels, + uint16x8_t v_border) { + uint16x8_t in_range_stretched = + vcombine(vdup_n_u16(in_range0), vdup_n_u16(in_range1)); + return vbslq_u16(in_range_stretched, pixels, v_border); +} + +inline void load_pixels_u8_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x4_t in_range_a, + uint16x4_t in_range_b, uint16x4_t in_range_c, uint16x4_t 
in_range_d, + uint8x16_t v_border, uint8x16_t &a, uint8x16_t &b, uint8x16_t &c, + uint8x16_t &d) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a = load_4px_4ch(src_rows, offsets_a); + b = load_4px_4ch(src_rows, offsets_b); + c = load_4px_4ch(src_rows, offsets_c); + d = load_4px_4ch(src_rows, offsets_d); + + a = replace_pixel_with_border_u8_4ch(in_range_a, a, v_border); + b = replace_pixel_with_border_u8_4ch(in_range_b, b, v_border); + c = replace_pixel_with_border_u8_4ch(in_range_c, c, v_border); + d = replace_pixel_with_border_u8_4ch(in_range_d, d, v_border); +} + +inline void load_pixels_u16_4ch_constant( + Rows src_rows, uint32x4_t offsets_a, uint32x4_t offsets_b, + uint32x4_t offsets_c, uint32x4_t offsets_d, uint16x4_t in_range_a, + uint16x4_t in_range_b, uint16x4_t in_range_c, uint16x4_t in_range_d, + uint16x8_t v_border, uint16x8_t &a_lo, uint16x8_t &a_hi, uint16x8_t &b_lo, + uint16x8_t &b_hi, uint16x8_t &c_lo, uint16x8_t &c_hi, uint16x8_t &d_lo, + uint16x8_t &d_hi) { + offsets_a = zero_out_of_range_offsets(in_range_a, offsets_a); + offsets_b = zero_out_of_range_offsets(in_range_b, offsets_b); + offsets_c = zero_out_of_range_offsets(in_range_c, offsets_c); + offsets_d = zero_out_of_range_offsets(in_range_d, offsets_d); + + a_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_a)); + b_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_b)); + c_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_c)); + d_lo = load_2px_4ch(src_rows, vget_low_u32(offsets_d)); + + a_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_a, 0), + vget_lane_u16(in_range_a, 1), a_lo, + v_border); + b_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_b, 0), + vget_lane_u16(in_range_b, 1), b_lo, + v_border); + c_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_c, 0), + vget_lane_u16(in_range_c, 1), c_lo, + v_border); + d_lo = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_d, 0), + vget_lane_u16(in_range_d, 1), d_lo, + v_border); + + a_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_a)); + b_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_b)); + c_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_c)); + d_hi = load_2px_4ch(src_rows, vget_high_u32(offsets_d)); + + a_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_a, 2), + vget_lane_u16(in_range_a, 3), a_hi, + v_border); + b_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_b, 2), + vget_lane_u16(in_range_b, 3), b_hi, + v_border); + c_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_c, 2), + vget_lane_u16(in_range_c, 3), c_hi, + v_border); + d_hi = replace_pixel_with_border_u16_4ch(vget_lane_u16(in_range_d, 2), + vget_lane_u16(in_range_d, 3), d_hi, + v_border); +} + +template +class RemapS16Point5Replicate4ch; + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t 
xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + res.val[0] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1), + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a, b, c, d); + res.val[1] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5), + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u8_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_xmax_{vdupq_n_s16(static_cast(src_width - 1))}, + v_ymax_{vdupq_n_s16(static_cast(src_height - 1))} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + get_coordinates_replicate(mapxy, mapfrac, x0, y0, x1, y1, xfrac, yfrac, + v_xmax_, v_ymax_); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; + uint16x8x4_t res; + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + res.val[0] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1)); + res.val[1] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_replicate(src_rows_, 
offsets_a, offsets_b, offsets_c, + offsets_d, a_low, a_high, b_low, b_high, + c_low, c_high, d_low, d_high); + res.val[2] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5)); + res.val[3] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u16_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + int16x8_t v_xmax_; + int16x8_t v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + +template +class RemapS16Point5Constant4ch; + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_stride_{vdup_n_u16(static_cast(src_rows_.stride()))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u8(*border_value)} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint8x16_t a, b, c, d; + uint8x16x2_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_low_u16(in_range_a), vget_low_u16(in_range_b), + vget_low_u16(in_range_c), vget_low_u16(in_range_d), v_border_, a, b, + c, d); + res.val[0] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1), + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_stride_); + load_pixels_u8_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_high_u16(in_range_a), vget_high_u16(in_range_b), + vget_high_u16(in_range_c), vget_high_u16(in_range_d), v_border_, a, b, + c, d); + res.val[1] = interpolate_u8_4ch( + a, b, c, d, vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5), + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u8_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = 
static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint8x16_t v_border_; +}; // end of class RemapS16Point5Constant4ch + +template <> +class RemapS16Point5Constant4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = neon::VecTraits; + + RemapS16Point5Constant4ch(Rows src_rows, size_t src_width, + size_t src_height, const ScalarType *border_value) + : src_rows_{src_rows}, + v_src_element_stride_{vdup_n_u16( + static_cast(src_rows_.stride() / sizeof(ScalarType)))}, + v_width_{vdupq_n_u16(static_cast(src_width))}, + v_height_{vdupq_n_u16(static_cast(src_height))}, + v_border_{vdupq_n_u16(*border_value)} {} + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + auto vector_path = [&](size_t step) { + uint16x8_t x0, y0, x1, y1; + uint8x8_t xfrac, yfrac; + uint16x8_t in_range_a, in_range_b, in_range_c, in_range_d; + get_coordinates_constant(mapxy, mapfrac, v_width_, v_height_, x0, y0, x1, + y1, xfrac, yfrac, in_range_a, in_range_b, + in_range_c, in_range_d); + + uint32x4_t offsets_a, offsets_b, offsets_c, offsets_d; + uint16x8_t a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high; + uint16x8x4_t res; + + get_offsets_4ch(vget_low_u16(x0), vget_low_u16(y0), vget_low_u16(x1), + vget_low_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_low_u16(in_range_a), vget_low_u16(in_range_b), + vget_low_u16(in_range_c), vget_low_u16(in_range_d), v_border_, a_low, + a_high, b_low, b_high, c_low, c_high, d_low, d_high); + res.val[0] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 0), vget_lane_u8(yfrac, 0), + vget_lane_u8(xfrac, 1), vget_lane_u8(yfrac, 1)); + res.val[1] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 2), vget_lane_u8(yfrac, 2), + vget_lane_u8(xfrac, 3), vget_lane_u8(yfrac, 3)); + + get_offsets_4ch(vget_high_u16(x0), vget_high_u16(y0), vget_high_u16(x1), + vget_high_u16(y1), offsets_a, offsets_b, offsets_c, + offsets_d, v_src_element_stride_); + load_pixels_u16_4ch_constant( + src_rows_, offsets_a, offsets_b, offsets_c, offsets_d, + vget_high_u16(in_range_a), vget_high_u16(in_range_b), + vget_high_u16(in_range_c), vget_high_u16(in_range_d), v_border_, + a_low, a_high, b_low, b_high, c_low, c_high, d_low, d_high); + res.val[2] = + interpolate_u16_4ch(a_low, c_low, b_low, d_low, + vget_lane_u8(xfrac, 4), vget_lane_u8(yfrac, 4), + vget_lane_u8(xfrac, 5), vget_lane_u8(yfrac, 5)); + res.val[3] = + interpolate_u16_4ch(a_high, c_high, b_high, d_high, + vget_lane_u8(xfrac, 6), vget_lane_u8(yfrac, 6), + vget_lane_u8(xfrac, 7), vget_lane_u8(yfrac, 7)); + + store_pixels_u16_4ch(res, dst); + mapxy += ptrdiff_t(step); + mapfrac += ptrdiff_t(step); + dst += ptrdiff_t(step); + }; + + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once(vector_path); + ptrdiff_t back_step = static_cast(loop.step()) - + static_cast(loop.remaining_length()); + mapxy -= back_step; + mapfrac -= back_step; + dst -= back_step; + loop.remaining([&](size_t, size_t step) { vector_path(step); }); + } + + private: + Rows src_rows_; + uint16x4_t v_src_element_stride_; + uint16x8_t v_width_; + uint16x8_t v_height_; + uint16x8_t 
v_border_; +}; + // Most of the complexity comes from parameter checking. // NOLINTBEGIN(readability-function-cognitive-complexity) template @@ -656,13 +1320,26 @@ kleidicv_error_t remap_s16point5( Rows dst_rows{dst, dst_stride, channels}; Rectangle rect{dst_width, dst_height}; if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { - RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, - border_value}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5Constant4ch operation{src_rows, src_width, src_height, + border_value}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); - RemapS16Point5Replicate operation{src_rows, src_width, src_height}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5Replicate4ch operation{src_rows, src_width, src_height}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } return KLEIDICV_OK; } diff --git a/kleidicv/src/transform/remap_sc.h b/kleidicv/src/transform/remap_sc.h index a0e27f1696bb2b418d7def0c95c478ada2664520..269757fec251cb8859f62c38d39a8f12af2624cb 100644 --- a/kleidicv/src/transform/remap_sc.h +++ b/kleidicv/src/transform/remap_sc.h @@ -580,6 +580,429 @@ class RemapS16Point5Replicate { MapVectorType& v_ymax_; }; // end of class RemapS16Point5Replicate +template +class RemapS16Point5Replicate4ch; + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint8_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_stride_ = svdup_u16(src_rows.stride()); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, 
svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); + svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); + + //// NEW PART + // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 + // channels + auto load_4ch_b = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_b, reinterpret_cast(&src_rows_[0]), + svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_))); + }; + auto load_4ch_t = [&](svuint16_t x, svuint16_t y) { + return svreinterpret_u8_u32(svld1_gather_u32offset_u32( + pg_t, reinterpret_cast(&src_rows_[0]), + svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_))); + }; + + FracVectorType frac = svld1_u16(pg, &mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint16_t line0 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); + svuint16_t line1 = svmla_x( + svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); + svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); + acc_b = svmlalb_u32(acc_b, line1, yfrac); + acc_t = svmlalt_u32(acc_t, line1, yfrac); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // bottom part + svuint8_t a = load_4ch_b(x0, y0); + svuint8_t b = load_4ch_b(x1, y0); + svuint8_t c = load_4ch_b(x0, y1); + svuint8_t d = load_4ch_b(x1, y1); + // from xfrac, we need the bottom part twice + svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); + svuint16_t nxfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); + svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); + svuint16_t nyfrac2b = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... + svuint16_t res_bb = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_bt = + lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_b = + svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); + + // top part + a = load_4ch_t(x0, y0); + b = load_4ch_t(x1, y0); + c = load_4ch_t(x0, y1); + d = load_4ch_t(x1, y1); + // from xfrac, we need the top part twice + svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); + svuint16_t nxfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); + svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); + svuint16_t nyfrac2t = svsub_u16_x( + svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); + + // a,b,c,d looks like 12341234...(four channels) + // bottom is 1313... 
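+    // ("bottom"/"top" here refer to the even/odd 16-bit lanes consumed by the
+    // SVE2 *B/*T widening instructions.)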
+ svuint16_t res_tb = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), + svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); + // top is 2424... + svuint16_t res_tt = + lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), + svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); + svuint8_t res_t = + svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); + + svbool_t pg_low = svwhilelt_b32(0L, step); + svbool_t pg_high = svwhilelt_b32(svcntw(), static_cast(step)); + svuint32_t res_low = + svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + svuint32_t res_high = + svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); + mapxy += step; + svst1_u32(pg_low, reinterpret_cast(&dst[0]), res_low); + svst1_u32(pg_high, reinterpret_cast(&dst[0]) + svcntw(), + res_high); + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + +template <> +class RemapS16Point5Replicate4ch { + public: + using ScalarType = uint16_t; + using MapVecTraits = VecTraits; + using MapVectorType = typename MapVecTraits::VectorType; + using MapVector2Type = typename MapVecTraits::Vector2Type; + using FracVecTraits = VecTraits; + using FracVectorType = typename FracVecTraits::VectorType; + + RemapS16Point5Replicate4ch(Rows src_rows, size_t src_width, + size_t src_height, svuint16_t& v_src_stride, + MapVectorType& v_x_max, MapVectorType& v_y_max) + : src_rows_{src_rows}, + v_src_element_stride_{v_src_stride}, + v_xmax_{v_x_max}, + v_ymax_{v_y_max} { + v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); + v_xmax_ = svdup_s16(static_cast(src_width - 1)); + v_ymax_ = svdup_s16(static_cast(src_height - 1)); + } + + void process_row(size_t width, Columns mapxy, + Columns mapfrac, Columns dst) { + LoopUnroll loop{width, MapVecTraits::num_lanes()}; + loop.unroll_once([&](size_t step) { + svbool_t pg = MapVecTraits::svptrue(); + vector_path(pg, mapxy, mapfrac, dst, static_cast(step)); + }); + loop.remaining([&](size_t length, size_t step) { + svbool_t pg = MapVecTraits::svwhilelt(step - length, step); + vector_path(pg, mapxy, mapfrac, dst, static_cast(length)); + }); + } + + void vector_path(svbool_t pg, Columns& mapxy, + Columns& mapfrac, Columns& dst, + ptrdiff_t step) { + MapVector2Type xy = svld2_s16(pg, &mapxy[0]); + svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); + + // Clamp coordinates to within the dimensions of the source image + svuint16_t x0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); + svuint16_t y0 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); + + // x1 = x0 + 1, and clamp it too + svuint16_t x1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); + + svuint16_t y1 = svreinterpret_u16_s16( + svmax_x(pg, svdup_n_s16(0), + svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); + + auto load_4ch_b = [&](svbool_t pg, svuint32_t offsets) { + return svreinterpret_u16_u64(svld1_gather_u64offset_u64( + pg, reinterpret_cast(&src_rows_[0]), + svshllb_n_u64(offsets, 1))); + }; + auto load_4ch_t = [&](svbool_t pg, svuint32_t offsets) { + return svreinterpret_u16_u64(svld1_gather_u64offset_u64( + pg, reinterpret_cast(&src_rows_[0]), + svshllt_n_u64(offsets, 1))); + }; + + FracVectorType frac = svld1_u16(pg, 
&mapfrac[0]); + svuint16_t xfrac = + svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t yfrac = + svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), + svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); + svuint16_t nxfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); + svuint16_t nyfrac = + svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); + + auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, + svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, + svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { + svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); + svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); + svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); + svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); + + svuint32_t acc_b = + svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); + svuint32_t acc_t = + svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); + acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); + acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); + + return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, + 2ULL * REMAP16POINT5_FRAC_BITS); + }; + + // There are 4 channels: data is 4 times wider than the maps (4x16 bits vs + // 16-bit coordinates) So calculation is done in 4 parts: + // - 0,4,8,... (bottom-bottom) + // - 2,6,10,... (bottom-top) + // - 1,5,9,... (top-bottom) + // - 3,7,11,... (top-top) + svuint16_t res_bb, res_bt, res_tb, res_tt; + + svuint16_t fractbl_bb = + svreinterpret_u16_u64(svindex_u64(0, 0x0004000400040004UL)); + svuint16_t fractbl_bt = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0002000200020002UL)); + svuint16_t fractbl_tb = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0001000100010001UL)); + svuint16_t fractbl_tt = svreinterpret_u16_u64( + svadd_n_u64_x(svptrue_b64(), svreinterpret_u64_u16(fractbl_bb), + 0x0003000300030003UL)); + + { // bottom + svbool_t pg_bb = svwhilelt_b64(int64_t{0}, (step + 3) / 4); + svbool_t pg_bt = svwhilelt_b64(int64_t{0}, (step + 2) / 4); + + svuint32_t offsets_a_b = + svmlalb_u32(svshllb_n_u32(x0, 2), y0, v_src_element_stride_); + svuint32_t offsets_b_b = + svmlalb_u32(svshllb_n_u32(x1, 2), y0, v_src_element_stride_); + svuint32_t offsets_c_b = + svmlalb_u32(svshllb_n_u32(x0, 2), y1, v_src_element_stride_); + svuint32_t offsets_d_b = + svmlalb_u32(svshllb_n_u32(x1, 2), y1, v_src_element_stride_); + + { // bottom-bottom + svuint16_t a = load_4ch_b(pg_bb, offsets_a_b); + svuint16_t b = load_4ch_b(pg_bb, offsets_b_b); + svuint16_t c = load_4ch_b(pg_bb, offsets_c_b); + svuint16_t d = load_4ch_b(pg_bb, offsets_d_b); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_bb); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_bb); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_bb); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_bb); + + res_bb = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // bottom-top + svuint16_t a = load_4ch_t(pg_bt, offsets_a_b); + svuint16_t b = load_4ch_t(pg_bt, offsets_b_b); + svuint16_t c = load_4ch_t(pg_bt, offsets_c_b); + svuint16_t d = load_4ch_t(pg_bt, offsets_d_b); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_bt); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_bt); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_bt); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_bt); + + res_bt = 
lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + } + + { // top + svbool_t pg_tb = svwhilelt_b64(int64_t{0}, (step + 1) / 4); + svbool_t pg_tt = svwhilelt_b64(int64_t{0}, step / 4); + + svuint32_t offsets_a_t = + svmlalt_u32(svshllt_n_u32(x0, 2), y0, v_src_element_stride_); + svuint32_t offsets_b_t = + svmlalt_u32(svshllt_n_u32(x1, 2), y0, v_src_element_stride_); + svuint32_t offsets_c_t = + svmlalt_u32(svshllt_n_u32(x0, 2), y1, v_src_element_stride_); + svuint32_t offsets_d_t = + svmlalt_u32(svshllt_n_u32(x1, 2), y1, v_src_element_stride_); + + { // top-bottom + svuint16_t a = load_4ch_b(pg_tb, offsets_a_t); + svuint16_t b = load_4ch_b(pg_tb, offsets_b_t); + svuint16_t c = load_4ch_b(pg_tb, offsets_c_t); + svuint16_t d = load_4ch_b(pg_tb, offsets_d_t); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_tb); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_tb); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_tb); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_tb); + + res_tb = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + + { // top-top + svuint16_t a = load_4ch_t(pg_tt, offsets_a_t); + svuint16_t b = load_4ch_t(pg_tt, offsets_b_t); + svuint16_t c = load_4ch_t(pg_tt, offsets_c_t); + svuint16_t d = load_4ch_t(pg_tt, offsets_d_t); + + svuint16_t xfr = svtbl_u16(xfrac, fractbl_tt); + svuint16_t nxfr = svtbl_u16(nxfrac, fractbl_tt); + svuint16_t yfr = svtbl_u16(yfrac, fractbl_tt); + svuint16_t nyfr = svtbl_u16(nyfrac, fractbl_tt); + + res_tt = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); + } + } + + svbool_t pg_00 = svwhilelt_b64(0L, step); + svbool_t pg_01 = svwhilelt_b64(svcntd(), static_cast(step)); + svbool_t pg_02 = svwhilelt_b64(2 * svcntd(), static_cast(step)); + svbool_t pg_03 = svwhilelt_b64(3 * svcntd(), static_cast(step)); + + // Back-transforming is needed, svst4 cannot be used because the + // interleaving would need equal number of elements in the 4 vectors the + // results are now these: + // - 0,4,8,... (bottom-bottom) + // - 2,6,10,... (bottom-top) + // - 1,5,9,... (top-bottom) + // - 3,7,11,... (top-top) + // first pass will result in: + // - 0,2,4,... (lower and higher ones) + // - 1,3,5,... (lower and higher ones) + // second pass gives back 0,1,2,3,.... 
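+    // Each 4-channel uint16_t pixel is one 64-bit element after the
+    // reinterpret, so every zip below moves whole pixels at a time.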
+ + svuint64_t res_even_low = svzip1_u64(svreinterpret_u64_u16(res_bb), + svreinterpret_u64_u16(res_bt)); + svuint64_t res_even_high = svzip2_u64(svreinterpret_u64_u16(res_bb), + svreinterpret_u64_u16(res_bt)); + svuint64_t res_odd_low = svzip1_u64(svreinterpret_u64_u16(res_tb), + svreinterpret_u64_u16(res_tt)); + svuint64_t res_odd_high = svzip2_u64(svreinterpret_u64_u16(res_tb), + svreinterpret_u64_u16(res_tt)); + + svuint64_t res_00 = svzip1_u64(res_even_low, res_odd_low); + svuint64_t res_01 = svzip2_u64(res_even_low, res_odd_low); + svuint64_t res_02 = svzip1_u64(res_even_high, res_odd_high); + svuint64_t res_03 = svzip2_u64(res_even_high, res_odd_high); + + svst1_u64(pg_00, reinterpret_cast(&dst[0]), res_00); + svst1_u64(pg_01, reinterpret_cast(&dst[0]) + svcntd(), res_01); + svst1_u64(pg_02, reinterpret_cast(&dst[0]) + 2 * svcntd(), + res_02); + svst1_u64(pg_03, reinterpret_cast(&dst[0]) + 3 * svcntd(), + res_03); + + mapxy += step; + mapfrac += step; + dst += step; + } + + Rows src_rows_; + + private: + svuint16_t& v_src_element_stride_; + MapVectorType& v_xmax_; + MapVectorType& v_ymax_; +}; // end of class RemapS16Point5Replicate4ch + template class RemapS16Point5ConstantBorder; @@ -808,16 +1231,29 @@ kleidicv_error_t remap_s16point5_sc( if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { svuint16_t sv_width, sv_height, sv_border; - RemapS16Point5ConstantBorder operation{ - src_rows, src_width, src_height, border_value, - sv_src_stride, sv_width, sv_height, sv_border}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5ConstantBorder operation{ + src_rows, src_width, src_height, border_value, + sv_src_stride, sv_width, sv_height, sv_border}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + // TODO + return KLEIDICV_ERROR_NOT_IMPLEMENTED; + } } else { assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); svint16_t sv_xmax, sv_ymax; - RemapS16Point5Replicate operation{src_rows, src_width, src_height, - sv_src_stride, sv_xmax, sv_ymax}; - zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + if (channels == 1) { + RemapS16Point5Replicate operation{src_rows, src_width, src_height, + sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } else { + assert(channels == 4); + RemapS16Point5Replicate4ch operation{ + src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; + zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); + } } return KLEIDICV_OK; } diff --git a/test/api/test_remap.cpp b/test/api/test_remap.cpp index 7a18a0bcf113bde7e967345fc3dd263d57ef25d2..a7bcc822243e186e9a02b79a040156f1268704bf 100644 --- a/test/api/test_remap.cpp +++ b/test/api/test_remap.cpp @@ -35,18 +35,23 @@ KLEIDICV_REMAP_F32(uint16_t, u16); template static const ScalarType *get_array2d_element_or_border( const test::Array2D &src, ptrdiff_t x, ptrdiff_t y, - kleidicv_border_type_t border_type, const ScalarType *border_value) { + ptrdiff_t ch, kleidicv_border_type_t border_type, + const ScalarType *border_value) { + // Width is the number of pixels in a row, but Array2D does not handle that + const ptrdiff_t src_width = + static_cast(src.width() / src.channels()); + if (border_type == KLEIDICV_BORDER_TYPE_REPLICATE) { - x = std::clamp(x, 0, static_cast(src.width()) - 1); + x = std::clamp(x, 0, src_width - 1); y = std::clamp(y, 0, static_cast(src.height()) - 1); } else { assert(border_type == KLEIDICV_BORDER_TYPE_CONSTANT); - if 
(x >= static_cast(src.width()) || - y >= static_cast(src.height()) || x < 0 || y < 0) { + if (x >= src_width || y >= static_cast(src.height()) || x < 0 || + y < 0) { return border_value; } } - return src.at(y, x * src.channels()); + return src.at(y, x * src.channels() + ch); } template @@ -146,12 +151,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -175,12 +179,11 @@ class RemapS16 : public testing::Test { calculate_expected(source, mapxy, border_type, border_value, expected); - ASSERT_EQ( - KLEIDICV_OK, - remap_s16()( - source.data(), source.stride(), source.width(), source.height(), - actual.data(), actual.stride(), actual.width(), actual.height(), - channels, mapxy.data(), mapxy.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, remap_s16()( + source.data(), source.stride(), src_w, + source.height(), actual.data(), actual.stride(), + dst_w, actual.height(), channels, mapxy.data(), + mapxy.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -189,9 +192,9 @@ class RemapS16 : public testing::Test { kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { @@ -200,7 +203,7 @@ class RemapS16 : public testing::Test { for (size_t ch = 0; ch < src.channels(); ++ch) { const int16_t *coords = mapxy.at(row, column * 2); int16_t x = coords[0], y = coords[1]; - *expected.at(row, column * src.channels() + ch) = get_src(x, y)[ch]; + *expected.at(row, column * src.channels() + ch) = *get_src(x, y, ch); } } } @@ -524,8 +527,9 @@ class RemapS16Point5 : public testing::Test { } } - // This part is the same as execute_test() but without initializing source. - // Corner Cases use the biggest possible source. + // This part is the same as execute_test() except source initialization. + // Corner Cases use the biggest possible source, so it is only initializing + // the edges. 
size_t src_total_width = channels * src_w; size_t dst_total_width = channels * dst_w; @@ -559,12 +563,12 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, actual, expected); } @@ -589,12 +593,12 @@ class RemapS16Point5 : public testing::Test { calculate_expected(source, mapxy, mapfrac, border_type, border_value, expected); - ASSERT_EQ(KLEIDICV_OK, remap_s16point5()( - source.data(), source.stride(), source.width(), - source.height(), actual.data(), actual.stride(), - actual.width(), actual.height(), channels, - mapxy.data(), mapxy.stride(), mapfrac.data(), - mapfrac.stride(), border_type, border_value)); + ASSERT_EQ(KLEIDICV_OK, + remap_s16point5()( + source.data(), source.stride(), src_w, source.height(), + actual.data(), actual.stride(), dst_w, actual.height(), + channels, mapxy.data(), mapxy.stride(), mapfrac.data(), + mapfrac.stride(), border_type, border_value)); EXPECT_EQ_ARRAY2D(actual, expected); } @@ -615,28 +619,28 @@ class RemapS16Point5 : public testing::Test { kleidicv_border_type_t border_type, const ScalarType *border_value, test::Array2D &expected) { - auto get_src = [&](ptrdiff_t x, ptrdiff_t y) { - return get_array2d_element_or_border(src, x, y, border_type, - border_value); + auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) { + return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch), + border_type, border_value); }; for (size_t row = 0; row < expected.height(); row++) { for (size_t column = 0; column < expected.width() / src.channels(); ++column) { + // Clang-tidy thinks mapfrac may contain garbage, but it is fully + // initialized at all code paths and the map size always equals dst + // map pixel size + // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) + uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); + uint8_t y_frac = + (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); + // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) + const int16_t *coords = mapxy.at(row, column * 2); + ptrdiff_t x = coords[0], y = coords[1]; for (size_t ch = 0; ch < src.channels(); ++ch) { - // Clang-tidy thinks mapfrac may contain garbage, but it is fully - // initialized at all code paths and the map size always equals dst - // map pixel size - // NOLINTBEGIN(clang-analyzer-core.UndefinedBinaryOperatorResult) - uint8_t x_frac = *mapfrac.at(row, column) & (FRAC_MAX - 1); - uint8_t y_frac = - (*mapfrac.at(row, column) >> FRAC_BITS) & (FRAC_MAX - 1); - // NOLINTEND(clang-analyzer-core.UndefinedBinaryOperatorResult) - const int16_t *coords = mapxy.at(row, column * 2); - int16_t x = coords[0], y = coords[1]; *expected.at(row, column * src.channels() + ch) = - lerp2d(x_frac, y_frac, get_src(x, y)[ch], get_src(x + 1, y)[ch], - get_src(x, y + 1)[ch], get_src(x + 1, y + 1)[ch]); + lerp2d(x_frac, y_frac, *get_src(x, y, ch), *get_src(x + 1, y, ch), + *get_src(x, y + 1, ch), 
+                     *get_src(x, y + 1, ch), *get_src(x + 1, y + 1, ch));
         }
       }
     }
@@ -646,48 +650,74 @@
 using RemapS16Point5ElementTypes = ::testing::Types;
 TYPED_TEST_SUITE(RemapS16Point5, RemapS16Point5ElementTypes);
 
+template
+size_t defaultWidth() {
+  return 3 * test::Options::vector_lanes() - 1;
+}
+
+size_t defaultHeight() { return 4; }
+
 TYPED_TEST(RemapS16Point5, RandomNoPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
+  for (auto [border_type, border_value] : get_borders()) {
+    TestFixture::test_random(w, h, w, h, 1, border_type, border_value, 0);
+  }
+}
+
+TYPED_TEST(RemapS16Point5, RandomNoPadding4ch) {
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
+  size_t channels = 4;
   size_t padding = 0;
   for (auto [border_type, border_value] : get_borders()) {
-    TestFixture::test_random(src_w, src_h, dst_w, dst_h, channels, border_type,
-                             border_value, padding);
+    TestFixture::test_random(w, h, w, h, channels, border_type, border_value,
+                             padding);
   }
 }
 
 TYPED_TEST(RemapS16Point5, BlendPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
-  size_t padding = 13;
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
   for (auto [border_type, border_value] : get_borders()) {
-    TestFixture::test_blend(src_w, src_h, dst_w, dst_h, channels, border_type,
-                            border_value, padding);
+    TestFixture::test_blend(w, h, w, h, 1, border_type, border_value, 13);
+  }
+}
+
+TYPED_TEST(RemapS16Point5, BlendPadding4ch) {
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 7;
+  for (auto [border_type, border_value] : get_borders()) {
+    TestFixture::test_blend(w, h, w, h, channels, border_type, border_value,
+                            padding);
   }
 }
 
 TYPED_TEST(RemapS16Point5, OutsideRandomPadding) {
-  size_t src_w = 3 * test::Options::vector_lanes() - 1;
-  size_t src_h = 4;
-  size_t dst_w = src_w;
-  size_t dst_h = src_h;
-  size_t channels = 1;
-  size_t padding = 13;
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
   for (auto [border_type, border_value] : get_borders()) {
-    TestFixture::test_outside_random(src_w, src_h, dst_w, dst_h, channels,
-                                     border_type, border_value, padding);
+    TestFixture::test_outside_random(w, h, w, h, 1, border_type, border_value,
+                                     13);
+  }
+}
+
+TYPED_TEST(RemapS16Point5, OutsideRandomPadding4ch) {
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 11;
+  for (auto [border_type, border_value] : get_borders()) {
+    TestFixture::test_outside_random(w, h, w, h, channels, border_type,
+                                     border_value, padding);
   }
 }
 
 TYPED_TEST(RemapS16Point5, BlendBigStride) {
-  size_t src_w = 3 * test::Options::vector_lanes() - 1;
-  size_t src_h = 16;
+  size_t src_w = defaultWidth();
+  size_t src_h = defaultHeight();
   size_t dst_w = src_w;
   size_t dst_h = src_h;
   size_t channels = 1;
@@ -698,6 +728,18 @@ TYPED_TEST(RemapS16Point5, BlendBigStride) {
   }
 }
 
+TYPED_TEST(RemapS16Point5, BlendBigStride4ch) {
+  size_t w = defaultWidth();
+  size_t h = defaultHeight();
+  size_t channels = 4;
+  size_t padding =
+      std::numeric_limits::max() / channels - w * channels;
+  for (auto [border_type, border_value] : get_borders()) {
+    TestFixture::test_blend(w, h, w, h, channels, border_type, border_value,
+                            padding);
+  }
+}
+
 TYPED_TEST(RemapS16Point5, CornerCases) {
   size_t src_w = std::numeric_limits::max() + 1;
   size_t src_h = std::numeric_limits::max() + 1;
@@ -711,6 +753,19 @@ TYPED_TEST(RemapS16Point5, CornerCases) {
   }
 }
 
+TYPED_TEST(RemapS16Point5, CornerCases4ch) {
+  size_t src_w = 100;
+  size_t src_h = 8;
+  size_t dst_w = defaultWidth();
+  size_t dst_h = defaultHeight();
+  size_t channels = 4;
+  size_t padding = 17;
+  for (auto [border_type, border_value] : get_borders()) {
+    TestFixture::test_corner_cases(src_w, src_h, dst_w, dst_h, channels,
+                                   border_type, border_value, padding);
+  }
+}
+
 TYPED_TEST(RemapS16Point5, NullPointer) {
   const TypeParam src[4] = {};
   TypeParam dst[1];
@@ -819,17 +874,20 @@ TYPED_TEST(RemapS16Point5, UnsupportedBigStride) {
                     KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
 
-TYPED_TEST(RemapS16Point5, UnsupportedTwoChannels) {
+TYPED_TEST(RemapS16Point5, UnsupportedChannels) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
   int16_t mapxy[16] = {};
   uint16_t mapfrac[8] = {};
 
-  EXPECT_EQ(
-      KLEIDICV_ERROR_NOT_IMPLEMENTED,
-      remap_s16point5()(
-          src, 1 * sizeof(TypeParam), 1, 1, dst, 8 * sizeof(TypeParam), 8, 1, 2,
-          mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            remap_s16point5()(
+                src, 1, 1, 1, dst, 8, 8, 1, 2, mapxy, 4, mapfrac, 2,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
+  EXPECT_EQ(KLEIDICV_ERROR_NOT_IMPLEMENTED,
+            remap_s16point5()(
+                src, 1, 1, 1, dst, 8, 8, 1, 3, mapxy, 4, mapfrac, 2,
+                KLEIDICV_BORDER_TYPE_REPLICATE, nullptr));
 }
 
 TYPED_TEST(RemapS16Point5, UnsupportedBorderType) {
@@ -844,6 +902,21 @@ TYPED_TEST(RemapS16Point5, UnsupportedBorderType) {
                 1, 1, mapxy, 4, mapfrac, 2, KLEIDICV_BORDER_TYPE_REFLECT, src));
 }
 
+TYPED_TEST(RemapS16Point5, UnsupportedBigStride4ch) {
+  const TypeParam src[1] = {};
+  TypeParam dst[8];
+  int16_t mapxy[16] = {};
+  uint16_t mapfrac[8] = {};
+
+  EXPECT_EQ(
+      KLEIDICV_ERROR_NOT_IMPLEMENTED,
+      remap_s16point5()(
+          src,
+          (std::numeric_limits::max() / 4 + 1L) * sizeof(TypeParam),
+          1, 1, dst, 8, 8, 1, 4, mapxy, 4, mapfrac, 2,
+          KLEIDICV_BORDER_TYPE_CONSTANT, src));
+}
+
 TYPED_TEST(RemapS16Point5, UnsupportedTooSmallImage) {
   const TypeParam src[1] = {};
   TypeParam dst[8];
@@ -1081,9 +1154,9 @@ class RemapF32 : public testing::Test {
                                   const ScalarType *border_value,
                                   kleidicv_interpolation_type_t interpolation,
                                   test::Array2D &expected) {
-    auto get_src = [&](ptrdiff_t x, ptrdiff_t y) {
-      return get_array2d_element_or_border(src, x, y, border_type,
-                                           border_value);
+    auto get_src = [&](ptrdiff_t x, ptrdiff_t y, size_t ch) {
+      return get_array2d_element_or_border(src, x, y, ptrdiff_t(ch),
+                                           border_type, border_value);
    };
 
     for (size_t row = 0; row < expected.height(); row++) {
@@ -1104,10 +1177,10 @@ class RemapF32 : public testing::Test {
           float xfrac = x - std::floor(x);
           float yfrac = y - std::floor(y);
           for (size_t ch = 0; ch < src.channels(); ++ch) {
-            float a = get_src(ix, iy)[ch];
-            float b = get_src(ix + 1, iy)[ch];
-            float c = get_src(ix, iy + 1)[ch];
-            float d = get_src(ix + 1, iy + 1)[ch];
+            float a = *get_src(ix, iy, ch);
+            float b = *get_src(ix + 1, iy, ch);
+            float c = *get_src(ix, iy + 1, ch);
+            float d = *get_src(ix + 1, iy + 1, ch);
             float line1 = (b - a) * xfrac + a;
             float line2 = (d - c) * xfrac + c;
             float float_result = (line2 - line1) * yfrac + line1;
@@ -1126,7 +1199,7 @@ class RemapF32 : public testing::Test {
                          static_cast(KLEIDICV_MAX_IMAGE_PIXELS))));
           for (size_t ch = 0; ch < src.channels(); ++ch) {
             *expected.at(row, column * src.channels() + ch) =
-                get_src(ix, iy)[ch];
+                *get_src(ix, iy, ch);
           }
         }
       }
diff --git a/test/api/test_thread.cpp b/test/api/test_thread.cpp
index 128cf7af7209275f708135dd0f1d263008231654..e7214531d3313446c869e1d1c033cd0364c5c2de 100644
--- a/test/api/test_thread.cpp
+++ b/test/api/test_thread.cpp
@@ -722,12 +722,24 @@ TEST_P(Thread, remap_s16point5_u8_border_replicate) {
                         KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
 }
 
+TEST_P(Thread, remap_s16point5_u8_border_replicate_4ch) {
+  check_remap_s16point5(kleidicv_remap_s16point5_u8,
+                        kleidicv_thread_remap_s16point5_u8, 4,
+                        KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+}
+
 TEST_P(Thread, remap_s16point5_u16_border_replicate) {
   check_remap_s16point5(kleidicv_remap_s16point5_u16,
                         kleidicv_thread_remap_s16point5_u16, 1,
                         KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
 }
 
+TEST_P(Thread, remap_s16point5_u16_border_replicate_4ch) {
+  check_remap_s16point5(kleidicv_remap_s16point5_u16,
+                        kleidicv_thread_remap_s16point5_u16, 4,
+                        KLEIDICV_BORDER_TYPE_REPLICATE, nullptr);
+}
+
 TEST_P(Thread, remap_s16point5_u8_border_constant) {
   const uint8_t border_value = 0;
   check_remap_s16point5(kleidicv_remap_s16point5_u8,
@@ -735,6 +747,13 @@ TEST_P(Thread, remap_s16point5_u8_border_constant) {
                         KLEIDICV_BORDER_TYPE_CONSTANT, &border_value);
 }
 
+TEST_P(Thread, remap_s16point5_u8_border_constant_4ch) {
+  const uint8_t border_value = 0;
+  check_remap_s16point5(kleidicv_remap_s16point5_u8,
+                        kleidicv_thread_remap_s16point5_u8, 4,
+                        KLEIDICV_BORDER_TYPE_CONSTANT, &border_value);
+}
+
 TEST_P(Thread, remap_s16point5_u16_border_constant) {
   const uint16_t border_value = 0;
   check_remap_s16point5(kleidicv_remap_s16point5_u16,
@@ -742,6 +761,13 @@ TEST_P(Thread, remap_s16point5_u16_border_constant) {
                         KLEIDICV_BORDER_TYPE_CONSTANT, &border_value);
 }
 
+TEST_P(Thread, remap_s16point5_u16_border_constant_4ch) {
+  const uint16_t border_value = 0;
+  check_remap_s16point5(kleidicv_remap_s16point5_u16,
+                        kleidicv_thread_remap_s16point5_u16, 4,
+                        KLEIDICV_BORDER_TYPE_CONSTANT, &border_value);
+}
+
 TEST_P(Thread, remap_s16point5_u8_not_implemented) {
   const uint8_t border_value = 0;
   check_remap_s16point5_not_implemented(