diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index 57d4a6de3044baeee1d910027dea5b46a6b591ae..e4772357e20909e97d946be93649e91e49081c1a 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -177,3 +177,22 @@ static void gaussian_blur_7x7_u8_3ch(benchmark::State& state) { gaussian_blur(kleidicv_gaussian_blur_7x7_u8, 3, state); } BENCHMARK(gaussian_blur_7x7_u8_3ch); + +static void exp_f32(benchmark::State& state) { + // Setup + std::vector src, dst; + src.resize(image_width * image_height); + dst.resize(image_width * image_height); + + std::mt19937 generator; + std::generate(src.begin(), src.end(), generator); + + for (auto _ : state) { + // This code gets benchmarked + auto unused = kleidicv_exp_f32(src.data(), image_width * sizeof(float), + dst.data(), image_width * sizeof(float), + image_width, image_height); + (void)unused; + } +} +BENCHMARK(exp_f32); diff --git a/kleidicv/include/kleidicv/arithmetics/exp_constants.h b/kleidicv/include/kleidicv/arithmetics/exp_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..10d2a61169829254c96ddaff19bcf5c39b099c81 --- /dev/null +++ b/kleidicv/include/kleidicv/arithmetics/exp_constants.h @@ -0,0 +1,22 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_EXP_CONSTANTS_H +#define KLEIDICV_EXP_CONSTANTS_H + +namespace kleidicv::exp_f32 { + +constexpr float kShift = 0x1.8p23F; +constexpr float kInvLn2 = 0x1.715476p+0F; +constexpr float kLn2Hi = 0x1.62e4p-1F; +constexpr float kLn2Lo = 0x1.7f7d1cp-20F; +constexpr float kPoly[] = { + /* maxerr: 0.36565 +0.5 ulp. */ + 0x1.6a6000p-10F, 0x1.12718ep-7F, 0x1.555af0p-5F, + 0x1.555430p-3F, 0x1.fffff4p-2F, +}; + +} // namespace kleidicv::exp_f32 + +#endif // KLEIDICV_EXP_CONSTANTS_H diff --git a/kleidicv/src/arithmetics/exp_api.cpp b/kleidicv/src/arithmetics/exp_api.cpp index 39c492719c72a09e4b958b5007f59e945c0821e2..d557b09941a950cba7a23e2f136e9b595f23c080 100644 --- a/kleidicv/src/arithmetics/exp_api.cpp +++ b/kleidicv/src/arithmetics/exp_api.cpp @@ -16,10 +16,27 @@ kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride, } // namespace neon +namespace sve2 { + +template +kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride, + size_t width, size_t height); + +} // namespace sve2 + +namespace sme2 { + +template +kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride, + size_t width, size_t height); + +} // namespace sme2 + } // namespace kleidicv -#define KLEIDICV_DEFINE_C_API(name, type) \ - KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::exp, nullptr, \ - nullptr) +#define KLEIDICV_DEFINE_C_API(name, type) \ + KLEIDICV_MULTIVERSION_C_API(name, &kleidicv::neon::exp, \ + &kleidicv::sve2::exp, \ + &kleidicv::sme2::exp) KLEIDICV_DEFINE_C_API(kleidicv_exp_f32, float); diff --git a/kleidicv/src/arithmetics/exp_neon.cpp b/kleidicv/src/arithmetics/exp_neon.cpp index 23fe62339e8d1e4127e11902f2911fecdb6026f5..274dc5ee3344b83cd6c2ffc47997730fc1e9343d 100644 --- a/kleidicv/src/arithmetics/exp_neon.cpp +++ b/kleidicv/src/arithmetics/exp_neon.cpp @@ -4,6 +4,7 @@ #include +#include "kleidicv/arithmetics/exp_constants.h" #include "kleidicv/kleidicv.h" #include "kleidicv/neon.h" @@ -19,27 +20,26 @@ class Exp final : public UnrollOnce { using VectorType = typename VecTraits::VectorType; VectorType vector_path(VectorType src) { - float32x4_t n, r, scale, poly, absn, z; + float32x4_t n, r, scale, poly, z; uint32x4_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32(vdupq_n(kShift), src, vdupq_n(kInvLn2)); - n = z - vdupq_n(kShift); - r = vfmaq_f32(src, n, vdupq_n(-kLn2Hi)); - r = vfmaq_f32(r, n, vdupq_n(-kLn2Lo)); + z = vfmaq_f32(vdupq_n(exp_f32::kShift), src, vdupq_n(exp_f32::kInvLn2)); + n = z - vdupq_n(exp_f32::kShift); + r = vfmaq_f32(src, n, vdupq_n(-exp_f32::kLn2Hi)); + r = vfmaq_f32(r, n, vdupq_n(-exp_f32::kLn2Lo)); e = vreinterpretq_u32_f32(z) << 23; scale = vreinterpretq_f32_u32(e + vdupq_n(0x3f800000)); - absn = vabsq_f32(n); - cmp = absn > vdupq_n(126.0F); - poly = vfmaq_f32(vdupq_n(kPoly[1]), vdupq_n(kPoly[0]), r); - poly = vfmaq_f32(vdupq_n(kPoly[2]), poly, r); - poly = vfmaq_f32(vdupq_n(kPoly[3]), poly, r); - poly = vfmaq_f32(vdupq_n(kPoly[4]), poly, r); + cmp = vcagtq_f32(n, vdupq_n(126.0F)); + poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[1]), vdupq_n(exp_f32::kPoly[0]), r); + poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[2]), poly, r); + poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[3]), poly, r); + poly = vfmaq_f32(vdupq_n(exp_f32::kPoly[4]), poly, r); poly = vfmaq_f32(vdupq_n(1.0F), poly, r); poly = vfmaq_f32(vdupq_n(1.0F), poly, r); if (KLEIDICV_UNLIKELY(v_any_u32(cmp))) { - return specialcase(poly, n, e, absn); + return specialcase(poly, n, e); } return scale * poly; } @@ -52,33 +52,24 @@ class Exp final : public UnrollOnce { return vpaddd_u64(vreinterpretq_u64_u32(x)) != 0; } - static float32x4_t specialcase(float32x4_t poly, float32x4_t n, uint32x4_t e, - float32x4_t absn) { + static float32x4_t specialcase(float32x4_t poly, float32x4_t n, + uint32x4_t e) { /* 2^n may overflow, break it up into s1*s2. */ uint32x4_t b = (n <= vdupq_n(0.0F)) & vdupq_n(0x83000000); float32x4_t s1 = vreinterpretq_f32_u32(vdupq_n(0x7f000000) + b); float32x4_t s2 = vreinterpretq_f32_u32(e - b); - uint32x4_t cmp = absn > vdupq_n(192.0F); + uint32x4_t cmp = vcagtq_f32(n, vdupq_n(192.0F)); float32x4_t r1 = s1 * s1; float32x4_t r0 = poly * s1 * s2; return vreinterpretq_f32_u32((cmp & vreinterpretq_u32_f32(r1)) | (~cmp & vreinterpretq_u32_f32(r0))); } - - static constexpr float kShift = 0x1.8p23F; - static constexpr float kInvLn2 = 0x1.715476p+0F; - static constexpr float kLn2Hi = 0x1.62e4p-1F; - static constexpr float kLn2Lo = 0x1.7f7d1cp-20F; - static constexpr float kPoly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10F, 0x1.12718ep-7F, 0x1.555af0p-5F, - 0x1.555430p-3F, 0x1.fffff4p-2F, - }; }; // end of class Exp template -kleidicv_error_t exp(const T* src, size_t src_stride, T* dst, size_t dst_stride, - size_t width, size_t height) { +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp(const T* src, size_t src_stride, + T* dst, size_t dst_stride, + size_t width, size_t height) { CHECK_POINTER_AND_STRIDE(src, src_stride, height); CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); CHECK_IMAGE_SIZE(width, height); diff --git a/kleidicv/src/arithmetics/exp_sc.h b/kleidicv/src/arithmetics/exp_sc.h new file mode 100644 index 0000000000000000000000000000000000000000..fea6b6ad9921bacb27ed10034e7ff8c30f51f976 --- /dev/null +++ b/kleidicv/src/arithmetics/exp_sc.h @@ -0,0 +1,97 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef KLEIDICV_EXP_SC_H +#define KLEIDICV_EXP_SC_H + +#include "kleidicv/arithmetics/exp_constants.h" +#include "kleidicv/kleidicv.h" +#include "kleidicv/sve2.h" + +namespace KLEIDICV_TARGET_NAMESPACE { +template +class Exp; + +template +class Exp final : public UnrollOnce { + public: + using ContextType = Context; + using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits; + using VectorType = typename VecTraits::VectorType; + + VectorType vector_path(ContextType ctx, + VectorType src) KLEIDICV_STREAMING_COMPATIBLE { + svfloat32_t n, r, poly, z; + svuint32_t e; + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kShift), src, + exp_f32::kInvLn2); + n = svsub_x(ctx.predicate(), z, exp_f32::kShift); + r = svmla_x(ctx.predicate(), src, n, -exp_f32::kLn2Hi); + r = svmla_x(ctx.predicate(), r, n, -exp_f32::kLn2Lo); + e = svlsl_x(ctx.predicate(), svreinterpret_u32(z), 23); + poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[1]), + svdup_f32(exp_f32::kPoly[0]), r); + poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[2]), poly, r); + poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[3]), poly, r); + poly = svmla_x(ctx.predicate(), svdup_f32(exp_f32::kPoly[4]), poly, r); + poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); + poly = svmla_x(ctx.predicate(), svdup_f32(1.0F), poly, r); + + if constexpr (TryShortPath) { + svbool_t cmp = svacgt(ctx.predicate(), n, 126.0F); + if (KLEIDICV_UNLIKELY(svptest_any(ctx.predicate(), cmp))) { + return specialcase(ctx.predicate(), poly, n, e); + } + svfloat32_t scale = + svreinterpret_f32(svadd_x(ctx.predicate(), e, 0x3f800000U)); + return svmul_x(ctx.predicate(), scale, poly); + } + + return specialcase(ctx.predicate(), poly, n, e); + } + + private: + static svfloat32_t specialcase(svbool_t pg, svfloat32_t poly, svfloat32_t n, + svuint32_t e) KLEIDICV_STREAMING_COMPATIBLE { + /* 2^n may overflow, break it up into s1*s2. */ + svuint32_t b = svsel(svcmple(pg, n, svdup_f32(0.0F)), + svdup_u32(0x83000000U), svdup_u32(0.0F)); + svfloat32_t s1 = svreinterpret_f32(svadd_x(pg, b, 0x7f000000U)); + svfloat32_t s2 = svreinterpret_f32(svsub_x(pg, e, b)); + svbool_t cmp = svacgt(pg, n, 192.0F); + svfloat32_t r1 = svmul_x(pg, s1, s1); + svfloat32_t r0 = svmul_x(pg, poly, svmul_x(pg, s1, s2)); + + return svsel(cmp, r1, r0); + } +}; // end of class Exp + +template +using ExpNoShortPath = Exp; + +template +using ExpTryShortPath = Exp; + +template +static kleidicv_error_t exp_sc(const T* src, size_t src_stride, T* dst, + size_t dst_stride, size_t width, + size_t height) KLEIDICV_STREAMING_COMPATIBLE { + CHECK_POINTER_AND_STRIDE(src, src_stride, height); + CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); + CHECK_IMAGE_SIZE(width, height); + + Operation operation; + Rectangle rect{width, height}; + Rows src_rows{src, src_stride}; + Rows dst_rows{dst, dst_stride}; + apply_operation_by_rows(operation, rect, src_rows, dst_rows); + return KLEIDICV_OK; +} + +} // namespace KLEIDICV_TARGET_NAMESPACE + +#endif // KLEIDICV_EXP_SC_H diff --git a/kleidicv/src/arithmetics/exp_sme2.cpp b/kleidicv/src/arithmetics/exp_sme2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f2b44dcb2633f25e656083d51081ae613235ab6 --- /dev/null +++ b/kleidicv/src/arithmetics/exp_sme2.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "exp_sc.h" + +namespace kleidicv::sme2 { + +template +KLEIDICV_LOCALLY_STREAMING KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t +exp(const T* src, size_t src_stride, T* dst, size_t dst_stride, size_t width, + size_t height) { + return exp_sc>(src, src_stride, dst, dst_stride, width, + height); +} + +#define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp( \ + const type* src, size_t src_stride, type* dst, size_t dst_stride, \ + size_t width, size_t height) + +KLEIDICV_INSTANTIATE_TEMPLATE(float); + +} // namespace kleidicv::sme2 diff --git a/kleidicv/src/arithmetics/exp_sve2.cpp b/kleidicv/src/arithmetics/exp_sve2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..78a013e4d6b66e107d809f1a29cccf52c7d443bc --- /dev/null +++ b/kleidicv/src/arithmetics/exp_sve2.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 + +#include "exp_sc.h" + +namespace kleidicv::sve2 { + +template +KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp(const T* src, size_t src_stride, + T* dst, size_t dst_stride, + size_t width, size_t height) { + return exp_sc>(src, src_stride, dst, dst_stride, width, + height); +} + +#define KLEIDICV_INSTANTIATE_TEMPLATE(type) \ + template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t exp( \ + const type* src, size_t src_stride, type* dst, size_t dst_stride, \ + size_t width, size_t height) + +KLEIDICV_INSTANTIATE_TEMPLATE(float); + +} // namespace kleidicv::sve2 diff --git a/test/api/test_exp.cpp b/test/api/test_exp.cpp index 9347b21c10716c7e8729fbbab66ac6f92b42cb60..f4f7b2af139884d205c9114a83b56a667a60e28b 100644 --- a/test/api/test_exp.cpp +++ b/test/api/test_exp.cpp @@ -237,3 +237,17 @@ TYPED_TEST(Exp, NullPointers) { test::test_null_args(exp(), src, sizeof(TypeParam), dst, sizeof(TypeParam), 1, 1); } + +TYPED_TEST(Exp, Misalignment) { + if (sizeof(TypeParam) == 1) { + // misalignment impossible + return; + } + TypeParam src[2] = {}, dst[2]; + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + exp()(src, sizeof(TypeParam) + 1, dst, sizeof(TypeParam), + 1, 2)); + EXPECT_EQ(KLEIDICV_ERROR_ALIGNMENT, + exp()(src, sizeof(TypeParam), dst, sizeof(TypeParam) + 1, + 1, 2)); +}