diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
index f3b034c802d461aa5e87a40cd73823e00e5fbf98..9f8a56b169d1af2daa2663263887d7e85109414a 100644
--- a/benchmark/benchmark.cpp
+++ b/benchmark/benchmark.cpp
@@ -473,3 +473,124 @@ static void blur_and_downsample_u8(benchmark::State& state) {
   (void)kleidicv_filter_context_release(context);
 }
 BENCHMARK(blur_and_downsample_u8);
+
+// Returns a lazily built, cached table of image_width * image_height
+// interleaved (x, y) pairs drawn from a default-seeded std::mt19937_64, so the
+// same table is produced on every run. Both distributions are inclusive, so
+// x == image_width and y == image_height (one past the last pixel) can occur.
+template <typename ScalarType>
+static const ScalarType* get_random_mapxy() {
+  auto generate_mapxy = [&]() {
+    std::vector<ScalarType> v(image_height * image_width * 2);
+    std::mt19937_64 rng;
+    std::uniform_int_distribution<ScalarType> dist_x(0, image_width),
+        dist_y(0, image_height);
+    for (size_t row = 0; row < image_height; ++row) {
+      for (size_t column = 0; column < image_width; ++column) {
+        size_t index = row * image_width + column;
+        v[2 * index] = dist_x(rng);
+        v[2 * index + 1] = dist_y(rng);
+      }
+    }
+    return v;
+  };
+  static std::vector<ScalarType> mapxy = generate_mapxy();
+  return mapxy.data();
+}
+
+// Returns a cached (x, y) table in which x follows a second degree function of
+// the column and y depends on both the row and the column.
+template <typename ScalarType>
+static const ScalarType* get_blend_mapxy() {
+  auto generate_mapxy = [&]() {
+    std::vector<ScalarType> v(image_height * image_width * 2);
+    for (int row = 0; row < static_cast<int>(image_height); ++row) {
+      for (int column = 0; column < static_cast<int>(image_width); ++column) {
+        size_t index = row * image_width + column;
+        // Use a second degree function to add a nonlinear blend to the image
+        v[2 * index] =
+            static_cast<ScalarType>(column * 2 - column * column / image_width);
+        v[2 * index + 1] =
+            static_cast<ScalarType>(row * (image_width - column) / image_width +
+                                    4 * row / image_height);
+      }
+    }
+    return v;
+  };
+  static std::vector<ScalarType> mapxy = generate_mapxy();
+  return mapxy.data();
+}
+
+// Returns a cached (x, y) table that mirrors every row horizontally:
+// (column, row) maps to (image_width - column - 1, row).
+template <typename ScalarType>
+static const ScalarType* get_flip_mapxy() {
+  auto generate_mapxy = [&]() {
+    std::vector<ScalarType> v(image_height * image_width * 2);
+    for (int row = 0; row < static_cast<int>(image_height); ++row) {
+      for (int column = 0; column < static_cast<int>(image_width); ++column) {
+        size_t index = row * image_width + column;
+        v[2 * index] = static_cast<ScalarType>(image_width - column - 1);
+        v[2 * index + 1] = static_cast<ScalarType>(row);
+      }
+    }
+    return v;
+  };
+  static std::vector<ScalarType> mapxy = generate_mapxy();
+  return mapxy.data();
+}
+
+// Returns a cached (x, y) table in which every (column, row) maps to itself.
+template <typename ScalarType>
+static const ScalarType* get_identity_mapxy() {
+  auto generate_mapxy = [&]() {
+    std::vector<ScalarType> v(image_height * image_width * 2);
+    for (int row = 0; row < static_cast<int>(image_height); ++row) {
+      for (int column = 0; column < static_cast<int>(image_width); ++column) {
+        size_t index = row * image_width + column;
+        v[2 * index] = static_cast<ScalarType>(column);
+        v[2 * index + 1] = static_cast<ScalarType>(row);
+      }
+    }
+    return v;
+  };
+  static std::vector<ScalarType> mapxy = generate_mapxy();
+  return mapxy.data();
+}
+
+// Benchmarks remap function `f` on the shared image_width x image_height
+// buffers of element type T, using the coordinate table returned by `mf`. The
+// table row stride is image_width * 2 * sizeof(int16_t); f's result is dropped.
+template <typename T, typename Function, typename MapFunc>
+static void remap_s16(Function f, MapFunc mf, size_t channels,
+                      kleidicv_border_type_t border_type,
+                      benchmark::State& state) {
+  bench_functor(state, [f, mf, channels, border_type]() {
+    (void)f(get_source_buffer_a<T>(), image_width * sizeof(T), image_width,
+            image_height, get_destination_buffer<T>(),
+            image_width * sizeof(T), image_width, image_height, channels, mf(),
+            image_width * 2 * sizeof(int16_t), border_type,
+            kleidicv_border_values_t{});
+  });
+}
+
+// Defines and registers benchmark `benchname`, which runs kleidicv_<name> on
+// elements of `type` with the coordinate table produced by `mapfunc`.
+#define BENCH_REMAP_S16(benchname, name, mapfunc, channels, border_type, type) \
+  static void benchname(benchmark::State& state) {                             \
+    remap_s16<type>(kleidicv_##name, mapfunc, channels, border_type, state);   \
+  }                                                                            \
+  BENCHMARK(benchname)
+
+BENCH_REMAP_S16(remap_s16_u8_random, remap_s16_u8, get_random_mapxy<int16_t>, 1,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_REMAP_S16(remap_s16_u8_blend, remap_s16_u8, get_blend_mapxy<int16_t>, 1,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_REMAP_S16(remap_s16_u8_flip, remap_s16_u8, get_flip_mapxy<int16_t>, 1,
+                KLEIDICV_BORDER_TYPE_REPLICATE, uint8_t);
+
+BENCH_REMAP_S16(remap_s16_u8_identity, remap_s16_u8,
+                get_identity_mapxy<int16_t>, 1, KLEIDICV_BORDER_TYPE_REPLICATE,
+                uint8_t);
diff --git a/kleidicv/include/kleidicv/remap/remap.h b/kleidicv/include/kleidicv/remap/remap.h
index 7af219ba4d3eaba37acf8ba5cc679dbf12d5a62a..115f940733551e39a6c23474240302940074bbb6 100644
--- a/kleidicv/include/kleidicv/remap/remap.h
+++ b/kleidicv/include/kleidicv/remap/remap.h
@@ -12,9 +12,9 @@
 namespace kleidicv {
 
 template <typename T>
-inline bool remap_s16_is_implemented(size_t dst_width,
-                                     kleidicv_border_type_t border_type,
-                                     size_t channels) {
+inline bool remap_s16_is_implemented(
+    size_t dst_width, kleidicv_border_type_t border_type,
+    size_t channels) KLEIDICV_STREAMING_COMPATIBLE {
   if constexpr (std::is_same<T, uint8_t>::value) {
     return (dst_width >= 8 &&
             border_type ==
@@ -26,9 +26,9 @@ inline bool remap_s16_is_implemented(size_t dst_width,
 }
 
 template <typename T>
-inline bool remap_s16point5_is_implemented(size_t dst_width,
-                                           kleidicv_border_type_t border_type,
-                                           size_t channels) {
+inline bool remap_s16point5_is_implemented(
+    size_t dst_width, kleidicv_border_type_t border_type,
+    size_t channels) KLEIDICV_STREAMING_COMPATIBLE {
   if constexpr (std::is_same<T, uint8_t>::value) {
     return (dst_width >= 8 &&
             border_type ==
@@ -60,6 +60,20 @@ kleidicv_error_t remap_s16point5(const T *src, size_t src_stride,
                                  kleidicv_border_values_t border_values);
 }  // namespace neon
 
+namespace sve2 {
+
+// SVE2 implementation of remap_s16; explicitly instantiated in
+// kleidicv/src/remap/remap_sve2.cpp.
+template <typename T>
+kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           const int16_t *mapxy, size_t mapxy_stride,
+                           kleidicv_border_type_t border_type,
+                           kleidicv_border_values_t border_values);
+
+}  // namespace sve2
+
 }  // namespace kleidicv
 
 #endif  // KLEIDICV_REMAP_REMAP_H
diff --git a/kleidicv/src/remap/remap_api.cpp b/kleidicv/src/remap/remap_api.cpp
index cd29673c01cadc1a81de12802bf78b32bc563b86..7d4c8cec85cb5c89d85a323d3499555b9b8a76c9 100644
--- a/kleidicv/src/remap/remap_api.cpp
+++ b/kleidicv/src/remap/remap_api.cpp
@@ -6,9 +6,12 @@
 #include "kleidicv/kleidicv.h"
 #include "kleidicv/remap/remap.h"
 
-#define KLEIDICV_DEFINE_C_API(outer_name, inner_name, type)                  \
-  KLEIDICV_MULTIVERSION_C_API(outer_name, &kleidicv::neon::inner_name<type>, \
-                              nullptr, nullptr)
+// kleidicv_remap_s16_u8 has NEON and SVE2 implementations.
+KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16_u8,
+                            &kleidicv::neon::remap_s16<uint8_t>,
+                            &kleidicv::sve2::remap_s16<uint8_t>, nullptr);
 
-KLEIDICV_DEFINE_C_API(kleidicv_remap_s16_u8, remap_s16, uint8_t);
-KLEIDICV_DEFINE_C_API(kleidicv_remap_s16point5_u8, remap_s16point5, uint8_t);
+// kleidicv_remap_s16point5_u8 only has a NEON implementation.
+KLEIDICV_MULTIVERSION_C_API(kleidicv_remap_s16point5_u8,
+                            &kleidicv::neon::remap_s16point5<uint8_t>, nullptr,
+                            nullptr);
diff --git a/kleidicv/src/remap/remap_sc.h b/kleidicv/src/remap/remap_sc.h
new file mode 100644
index 0000000000000000000000000000000000000000..837090093d4d297eb8611db6401b15fd320917a9
--- /dev/null
+++ b/kleidicv/src/remap/remap_sc.h
@@ -0,0 +1,170 @@
+// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef KLEIDICV_REMAP_SC_H
+#define KLEIDICV_REMAP_SC_H
+
+#include <arm_sve.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+
+#include "kleidicv/kleidicv.h"
+#include "kleidicv/remap/remap.h"
+#include "kleidicv/sve2.h"
+
+namespace KLEIDICV_TARGET_NAMESPACE {
+
+template <typename ScalarType>
+class RemapS16;
+
+// Row operation that remaps uint8_t pixels using interleaved int16_t (x, y)
+// coordinates: each destination pixel is copied from the source pixel at the
+// coordinate read from the map, after clamping it to the source image.
+template <>
+class RemapS16<uint8_t> {
+ public:
+  using ScalarType = uint8_t;
+  using MapVecTraits = VecTraits<int16_t>;
+  using MapVectorType = typename MapVecTraits::VectorType;
+  using MapVector2Type = typename MapVecTraits::Vector2Type;
+
+  // The vector arguments are stored by reference and must outlive this object.
+  // v_src_stride holds the source row stride in every 16-bit lane; v_x_max and
+  // v_y_max hold the largest valid x and y coordinate in every lane.
+  RemapS16(Rows<const ScalarType> src_rows, svuint16_t& v_src_stride,
+           MapVectorType& v_x_max,
+           MapVectorType& v_y_max) KLEIDICV_STREAMING_COMPATIBLE
+      : src_rows_{src_rows},
+        v_src_stride_{v_src_stride},
+        v_xmax_{v_x_max},
+        v_ymax_{v_y_max} {}
+
+  // Remaps `width` destination pixels of a single row.
+  void process_row(size_t width, Columns<const int16_t> mapxy,
+                   Columns<ScalarType> dst) KLEIDICV_STREAMING_COMPATIBLE {
+    // Byte offsets of the even (_b) and odd (_t) pixels of the current vector.
+    svuint32_t offsets_b, offsets_t;
+    svint16_t svzero = svdup_n_s16(0);
+    auto load_offsets = [&](svbool_t pg) KLEIDICV_STREAMING_COMPATIBLE {
+      MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
+      // Clamp coordinates to within the dimensions of the source image
+      svuint16_t x = svreinterpret_u16_s16(
+          svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_)));
+      svuint16_t y = svreinterpret_u16_s16(
+          svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_)));
+      // Calculate offsets from coordinates (y * stride + x)
+      offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_stride_);
+      offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_stride_);
+    };
+
+    svbool_t pg_all = MapVecTraits::svptrue();
+    // An all-false predicate makes svlasta return element #0
+    svbool_t pg_0 = svpfalse();
+
+    // Tail path: copies `step` pixels one at a time, rotating each offset
+    // vector so that the next offset to use is always in element #0.
+    auto generic_vector_path =
+        [&](svbool_t pg, ptrdiff_t step) KLEIDICV_STREAMING_COMPATIBLE {
+          load_offsets(pg);
+          // Copy pixels from source
+          for (ptrdiff_t i = 0; i < step / 2; ++i) {
+            dst[i * 2] = src_rows_[svlasta_u32(pg_0, offsets_b)];
+            offsets_b = svext(offsets_b, offsets_b, 1);
+            dst[i * 2 + 1] = src_rows_[svlasta_u32(pg_0, offsets_t)];
+            offsets_t = svext(offsets_t, offsets_t, 1);
+          }
+          if (step % 2) {
+            dst[step - 1] = src_rows_[svlasta_u32(pg_0, offsets_b)];
+          }
+          mapxy += step;
+          dst += step;
+        };
+
+    // Full vector path: gathers one source byte per offset and stores the
+    // even/odd results re-interleaved into destination order.
+    // NOTE: gather load is not available in streaming mode
+    auto gather_load_full_vector_path =
+        [&](svbool_t pg, ptrdiff_t step) KLEIDICV_STREAMING_COMPATIBLE {
+          load_offsets(pg);
+          // Copy pixels from source
+          svuint32_t result_b =
+              svldnt1ub_gather_u32offset_u32(pg, &src_rows_[0], offsets_b);
+          svuint32_t result_t =
+              svldnt1ub_gather_u32offset_u32(pg, &src_rows_[0], offsets_t);
+          svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
+                                         svreinterpret_u16_u32(result_t));
+          svst1b_u16(pg, &dst[0], result);
+          mapxy += step;
+          dst += step;
+        };
+
+    LoopUnroll loop{width, MapVecTraits::num_lanes()};
+    loop.unroll_once([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE {
+      gather_load_full_vector_path(pg_all, static_cast<ptrdiff_t>(step));
+    });
+    loop.remaining(
+        [&](size_t length, size_t step) KLEIDICV_STREAMING_COMPATIBLE {
+          svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
+          generic_vector_path(pg, static_cast<ptrdiff_t>(length));
+        });
+  }
+
+ private:
+  Rows<const ScalarType> src_rows_;
+  svuint16_t& v_src_stride_;
+  MapVectorType& v_xmax_;
+  MapVectorType& v_ymax_;
+};  // end of class RemapS16<uint8_t>
+
+// Remaps `src` into `dst` using the interleaved int16_t (x, y) coordinates in
+// `mapxy`. Returns KLEIDICV_ERROR_NOT_IMPLEMENTED when
+// remap_s16_is_implemented<T> rejects the arguments or when the source row
+// stride does not fit in the 16-bit lanes used to calculate offsets, and
+// KLEIDICV_OK once all rows are processed. The CHECK_* macros are defined
+// elsewhere and presumably return early on invalid arguments. The border
+// values are not used.
+template <typename T>
+kleidicv_error_t remap_s16_sc(
+    const T* src, size_t src_stride, size_t src_width, size_t src_height,
+    T* dst, size_t dst_stride, size_t dst_width, size_t dst_height,
+    size_t channels, const int16_t* mapxy, size_t mapxy_stride,
+    kleidicv_border_type_t border_type,
+    kleidicv_border_values_t) KLEIDICV_STREAMING_COMPATIBLE {
+  CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
+  CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
+  CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
+  CHECK_IMAGE_SIZE(src_width, src_height);
+  CHECK_IMAGE_SIZE(dst_width, dst_height);
+
+  if (!remap_s16_is_implemented<T>(dst_width, border_type, channels)) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  Rows<const T> src_rows{src, src_stride, channels};
+  // The stride is broadcast into 16-bit lanes below; a larger value would be
+  // silently truncated and yield wrong source offsets, so reject it instead.
+  const size_t row_stride = src_rows.stride();
+  if (row_stride > std::numeric_limits<uint16_t>::max()) {
+    return KLEIDICV_ERROR_NOT_IMPLEMENTED;
+  }
+
+  Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
+  Rows<T> dst_rows{dst, dst_stride, channels};
+  svuint16_t sv_src_stride = svdup_u16(static_cast<uint16_t>(row_stride));
+  // Map coordinates are int16_t, so the clamp limits saturate at its maximum.
+  svint16_t sv_xmax = svdup_s16(static_cast<int16_t>(
+      std::min<size_t>(std::numeric_limits<int16_t>::max(), src_width - 1)));
+  svint16_t sv_ymax = svdup_s16(static_cast<int16_t>(
+      std::min<size_t>(std::numeric_limits<int16_t>::max(), src_height - 1)));
+  RemapS16<T> operation{src_rows, sv_src_stride, sv_xmax, sv_ymax};
+  Rectangle rect{dst_width, dst_height};
+  zip_rows(operation, rect, mapxy_rows, dst_rows);
+  return KLEIDICV_OK;
+}
+
+}  // namespace KLEIDICV_TARGET_NAMESPACE
+
+#endif  // KLEIDICV_REMAP_SC_H
diff --git a/kleidicv/src/remap/remap_sve2.cpp b/kleidicv/src/remap/remap_sve2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9515a010a5754bfa11c8979352eda7e96d9a42fd
--- /dev/null
+++ b/kleidicv/src/remap/remap_sve2.cpp
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2023 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "remap_sc.h"
+
+namespace kleidicv::sve2 {
+
+// SVE2 entry point for remap_s16: forwards every argument to the shared
+// implementation in remap_sc.h.
+template <typename T>
+kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width,
+                           size_t src_height, T *dst, size_t dst_stride,
+                           size_t dst_width, size_t dst_height, size_t channels,
+                           const int16_t *mapxy, size_t mapxy_stride,
+                           kleidicv_border_type_t border_type,
+                           kleidicv_border_values_t border_values) {
+  return remap_s16_sc<T>(src, src_stride, src_width, src_height, dst,
+                         dst_stride, dst_width, dst_height, channels,
+                         mapxy, mapxy_stride, border_type, border_values);
+}
+
+// Explicitly instantiates remap_s16 for element type `type`.
+#define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type)                         \
+  template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16<type>(         \
+      const type *src, size_t src_stride, size_t src_width, size_t src_height, \
+      type *dst, size_t dst_stride, size_t dst_width, size_t dst_height,      \
+      size_t channels, const int16_t *mapxy, size_t mapxy_stride,             \
+      kleidicv_border_type_t border_type,                                     \
+      kleidicv_border_values_t border_values)
+
+KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t);
+
+}  // namespace kleidicv::sve2