diff --git a/kleidicv/src/conversions/float_conv_sc.h b/kleidicv/src/conversions/float_conv_sc.h index 9e19b24164a48127367f3f345b9db87ec531472e..d62bec2fd2bf534be7ff5b22ce9dac445511c4fc 100644 --- a/kleidicv/src/conversions/float_conv_sc.h +++ b/kleidicv/src/conversions/float_conv_sc.h @@ -24,20 +24,36 @@ class float_conversion_operation { using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits< std::conditional_t, int32_t, uint32_t>>; using IntermediateVectorType = typename IntermediateVecTraits::VectorType; + using DstVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using DstVectorType = typename DstVecTraits::VectorType; + + explicit float_conversion_operation(svuint8_t& index) + KLEIDICV_STREAMING_COMPATIBLE : index_(index) { + // Index generation to reorder converted values by tbl instruction + auto index0 = svindex_u8(0, 4); + auto index1 = svindex_u8(1, 4); + auto index2 = svindex_u8(2, 4); + auto index3 = svindex_u8(3, 4); + + svbool_t pg = svwhilelt_b8(uint64_t(0), svcntb() / 4); + + index_ = svsplice(pg, index3, svdup_u8(0)); + index_ = svsplice(pg, index2, index_); + index_ = svsplice(pg, index1, index_); + index_ = svsplice(pg, index0, index_); + } void process_row(size_t width, Columns src, Columns dst) KLEIDICV_STREAMING_COMPATIBLE { LoopUnroll{width, SrcVecTraits::num_lanes()} - .unroll_twice([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg = SrcVecTraits::svptrue(); - SrcVectorType src_vector1 = svld1(pg, &src[0]); - SrcVectorType src_vector2 = svld1_vnum(pg, &src[0], 1); - IntermediateVectorType result_vector1 = - vector_path(pg, src_vector1); - IntermediateVectorType result_vector2 = - vector_path(pg, src_vector2); - svst1b(pg, &dst[0], result_vector1); - svst1b_vnum(pg, &dst[0], 1, result_vector2); + .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg = DstVecTraits::svptrue(); + SrcVectorType src_v0 = svld1(pg, &src[0]); + SrcVectorType src_v1 = svld1_vnum(pg, &src[0], 1); + SrcVectorType src_v2 = svld1_vnum(pg, &src[0], 2); + SrcVectorType src_v3 = svld1_vnum(pg, &src[0], 3); + DstVectorType res0 = vector_path(pg, src_v0, src_v1, src_v2, src_v3); + svst1(pg, &dst[0], res0); src += ptrdiff_t(step); dst += ptrdiff_t(step); }) @@ -47,7 +63,7 @@ class float_conversion_operation { while (svptest_first(SrcVecTraits::svptrue(), pg)) { SrcVectorType src_vector = svld1(pg, &src[ptrdiff_t(index)]); IntermediateVectorType result_vector = - vector_path(pg, src_vector); + remaining_path(pg, src_vector); svst1b(pg, &dst[ptrdiff_t(index)], result_vector); // Update loop counter and calculate the next governing predicate. index += SrcVecTraits::num_lanes(); @@ -60,7 +76,48 @@ class float_conversion_operation { template < typename O, std::enable_if_t && std::is_signed_v, int> = 0> - IntermediateVectorType vector_path(svbool_t& pg, SrcVectorType src) + decltype(auto) convert(svbool_t full_pg, + SrcVectorType in) KLEIDICV_STREAMING_COMPATIBLE { + return svcvt_s32_f32_x(full_pg, in); + } + + template < + typename O, + std::enable_if_t && !std::is_signed_v, int> = 0> + decltype(auto) convert(svbool_t full_pg, + SrcVectorType in) KLEIDICV_STREAMING_COMPATIBLE { + return svcvt_u32_f32_x(full_pg, in); + } + + DstVectorType vector_path(svbool_t full_pg, SrcVectorType fsrc0, + SrcVectorType fsrc1, SrcVectorType fsrc2, + SrcVectorType fsrc3) KLEIDICV_STREAMING_COMPATIBLE { + fsrc0 = svrinti_f32_x(full_pg, fsrc0); + fsrc1 = svrinti_f32_x(full_pg, fsrc1); + fsrc2 = svrinti_f32_x(full_pg, fsrc2); + fsrc3 = svrinti_f32_x(full_pg, fsrc3); + + auto _32bit_res0 = convert(full_pg, fsrc0); + auto _32bit_res1 = convert(full_pg, fsrc1); + auto _32bit_res2 = convert(full_pg, fsrc2); + auto _32bit_res3 = convert(full_pg, fsrc3); + + auto _16bit_res0 = svqxtnb(_32bit_res0); + _16bit_res0 = svqxtnt(_16bit_res0, _32bit_res2); + + auto _16bit_res1 = svqxtnb(_32bit_res1); + _16bit_res1 = svqxtnt(_16bit_res1, _32bit_res3); + + auto _8bit_res = svqxtnb(_16bit_res0); + _8bit_res = svqxtnt(_8bit_res, _16bit_res1); + + return svtbl(_8bit_res, index_); + } + + template < + typename O, + std::enable_if_t && std::is_signed_v, int> = 0> + IntermediateVectorType remaining_path(svbool_t& pg, SrcVectorType src) KLEIDICV_STREAMING_COMPATIBLE { constexpr float min_val = std::numeric_limits::min(); constexpr float max_val = std::numeric_limits::max(); @@ -79,7 +136,7 @@ class float_conversion_operation { template < typename O, std::enable_if_t && !std::is_signed_v, int> = 0> - IntermediateVectorType vector_path(svbool_t& pg, SrcVectorType src) + IntermediateVectorType remaining_path(svbool_t& pg, SrcVectorType src) KLEIDICV_STREAMING_COMPATIBLE { constexpr float max_val = std::numeric_limits::max(); @@ -90,6 +147,8 @@ class float_conversion_operation { return svcvt_u32_f32_x(pg, src); } + + svuint8_t& index_; }; // end of class float_conversion_operation template @@ -97,6 +156,9 @@ class float_conversion_operation { public: using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; using VectorType = typename VecTraits::VectorType; + + explicit float_conversion_operation(svuint8_t&) {} + void process_row(size_t width, Columns src, Columns dst) KLEIDICV_STREAMING_COMPATIBLE { LoopUnroll{width, VecTraits::num_lanes()} @@ -166,7 +228,8 @@ static kleidicv_error_t float_conversion_sc( CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); - float_conversion_operation operation; + svuint8_t index; + float_conversion_operation operation{index}; Rectangle rect{width, height}; Rows src_rows{src, src_stride}; Rows dst_rows{dst, dst_stride}; diff --git a/scripts/benchmark/run_benchmarks_4K.sh b/scripts/benchmark/run_benchmarks_4K.sh index 99a5ef055dd374ca780cfaf8c1549b7f8ef5b7e0..5800d0d52bad142c6eb403d1003c8adf2eeab83d 100755 --- a/scripts/benchmark/run_benchmarks_4K.sh +++ b/scripts/benchmark/run_benchmarks_4K.sh @@ -73,4 +73,9 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_S32_4K opencv_perf_core '*minMaxVals/*' '(3840x2160, 32SC1)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_F32_4K opencv_perf_core '*minMaxVals/*' '(3840x2160, 32FC1)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToInt opencv_perf_core '*convertTo/*' '(3840x2160, 32FC1, 8SC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToUint opencv_perf_core '*convertTo/*' '(3840x2160, 32FC1, 8UC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL IntToFloat opencv_perf_core '*convertTo/*' '(3840x2160, 8SC1, 32FC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL UintToFloat opencv_perf_core '*convertTo/*' '(3840x2160, 8UC1, 32FC1, 1, 1, 0)')") + echo "$RES" diff --git a/scripts/benchmark/run_benchmarks_FHD.sh b/scripts/benchmark/run_benchmarks_FHD.sh index f3c01cb935b098481905f8b85f63ff6403b4252f..ed6edc7f64d69d174986fc0a18c7121f93074f33 100755 --- a/scripts/benchmark/run_benchmarks_FHD.sh +++ b/scripts/benchmark/run_benchmarks_FHD.sh @@ -73,4 +73,10 @@ RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_S32_FHD opencv_perf_core '*minMaxVals/*' '(1920x1080, 32SC1)')") RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL MinMax_F32_FHD opencv_perf_core '*minMaxVals/*' '(1920x1080, 32FC1)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToInt opencv_perf_core '*convertTo/*' '(1920x1080, 32FC1, 8SC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL FloatToUint opencv_perf_core '*convertTo/*' '(1920x1080, 32FC1, 8UC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL IntToFloat opencv_perf_core '*convertTo/*' '(1920x1080, 8SC1, 32FC1, 1, 1, 0)')") +RES+=$(printf "\n$(${DEV_DIR}/perf_test_op.sh $CUSTOM_BUILD_SUFFIX $CPU $THERMAL UintToFloat opencv_perf_core '*convertTo/*' '(1920x1080, 8UC1, 32FC1, 1, 1, 0)')") + + echo "$RES"