From c4e78498fd512d5ac7c6f972cde078dd2a5a33fd Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Wed, 19 Feb 2025 13:57:03 +0000 Subject: [PATCH 1/3] Add hyperbolic functions to benchmarks --- src/libm-benchmarks/benchlibm.cpp | 9 +++++++++ src/libm-benchmarks/benchsleef.cpp | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/src/libm-benchmarks/benchlibm.cpp b/src/libm-benchmarks/benchlibm.cpp index 2d29d95..f7aeb61 100644 --- a/src/libm-benchmarks/benchlibm.cpp +++ b/src/libm-benchmarks/benchlibm.cpp @@ -66,4 +66,13 @@ BENCH_DOUBLE_SCALAR(expm1, -700, 700); // pow BENCH_SCALAR_2ARGS(pow, -30, 30); +// ======================HYPERBOLIC======================== +// cosh +BENCH_SINGLE_SCALAR(cosh, -100, 100); +BENCH_DOUBLE_SCALAR(cosh, -700, 700); + +// sinh +BENCH_SINGLE_SCALAR(sinh, -100, 100); +BENCH_DOUBLE_SCALAR(sinh, -700, 700); + BENCHMARK_MAIN(); \ No newline at end of file diff --git a/src/libm-benchmarks/benchsleef.cpp b/src/libm-benchmarks/benchsleef.cpp index 8c7ae8a..eb9f364 100644 --- a/src/libm-benchmarks/benchsleef.cpp +++ b/src/libm-benchmarks/benchsleef.cpp @@ -67,4 +67,12 @@ BENCH_DOUBLEP_W_FIX_ULP(expm1, u10, -700, 700); // pow BENCH_ALL_W_FIX_ULP(pow, u10, -30, 30); +// ======================HYPERBOLIC======================== +// cosh +BENCH_ALL_SINGLEP(cosh, -100, 100); +BENCH_ALL_DOUBLEP(cosh, -700, 700); +// sinh +BENCH_ALL_SINGLEP(sinh, -100, 100); +BENCH_ALL_DOUBLEP(sinh, -700, 700); + BENCHMARK_MAIN(); \ No newline at end of file -- GitLab From 8e7d4b4ea6f89842c70626f0be54c4eddd372867 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Mon, 17 Feb 2025 12:07:48 +0000 Subject: [PATCH 2/3] Fix coshf and sinhf early overflow Change test to reveal early overflow in sinhf/coshf, and fix it. A better approach would delay the overflow in expk2f rather than using a different formula, but would it preserve accuracy? Decide whether to inline special case depending on performance. Could increase register pressure but should also be able to re-use expk2f coefficients. Current approach shows performance improvement despite additional logic. --- src/common/keywords.txt | 1 + src/common/misc.h | 6 +++++- src/libm-tester/tester.c | 4 ++-- src/libm/sleefsimdsp.c | 36 ++++++++++++++++++++++++++++++++---- src/libm/sleefsp.c | 18 ++++++++++++++++-- 5 files changed, 56 insertions(+), 9 deletions(-) diff --git a/src/common/keywords.txt b/src/common/keywords.txt index 816919c..daaa2dc 100644 --- a/src/common/keywords.txt +++ b/src/common/keywords.txt @@ -143,6 +143,7 @@ tdxsetx_tdx_tdx_vd tdxsetxyz_tdx_tdx_vd_vd_vd tdxsety_tdx_tdx_vd tdxsetz_tdx_tdx_vd +trighf_special vabs_vd_vd vabs_vf2_vf2 vabs_vf_vf diff --git a/src/common/misc.h b/src/common/misc.h index 2f21411..53902ec 100644 --- a/src/common/misc.h +++ b/src/common/misc.h @@ -150,8 +150,12 @@ // Other bounds +// - cosh(f)(x) and sinh(f)(x) approximation holds up to x equals +#define TRIGHF_BOUND 0x1.6p+6f /* 88.0 */ +#define ATRIGHF_FLT_MAX 0x1.65a9f8p+6 /* 89.41599 */ + // - log1p(f)(x) approximation holds up to x equals -#define LOG1PF_BOUND 0x1.2ced32p+126 /* 1.0e+38 */ +#define LOG1PF_BOUND 0x1.2ced32p+126f /* 1.0e+38 */ #define LOG1P_BOUND 0x1.c7b1f3cac7433p+1019 /* 1.0e+307 */ // diff --git a/src/libm-tester/tester.c b/src/libm-tester/tester.c index 2a5ed00..63f6d8c 100644 --- a/src/libm-tester/tester.c +++ b/src/libm-tester/tester.c @@ -4868,7 +4868,7 @@ void do_test() { fprintf(stderr, "sinhf : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_f(mpfr_sinh, child_sinhf, d, 1.0); if (!enableFlushToZero) { - for(d = -88;d < 88 && success;d += 0.2) checkAccuracy_f(mpfr_sinh, child_sinhf, d, 1.0); + for(d = -TRIGHF_BOUND-4;d < TRIGHF_BOUND+4 && success;d += 0.2) checkAccuracy_f(mpfr_sinh, child_sinhf, d, 1.0); } showResult(success); @@ -4877,7 +4877,7 @@ void do_test() { fprintf(stderr, "coshf : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_f(mpfr_cosh, child_coshf, d, 1.0); if (!enableFlushToZero) { - for(d = -88;d < 88 && success;d += 0.2) checkAccuracy_f(mpfr_cosh, child_coshf, d, 1.0); + for(d = -TRIGHF_BOUND-4;d < TRIGHF_BOUND+4 && success;d += 0.2) checkAccuracy_f(mpfr_cosh, child_coshf, d, 1.0); } showResult(success); diff --git a/src/libm/sleefsimdsp.c b/src/libm/sleefsimdsp.c index c66645e..81b38d5 100644 --- a/src/libm/sleefsimdsp.c +++ b/src/libm/sleefsimdsp.c @@ -2475,14 +2475,32 @@ static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) FUNC_ATTR { } #if !defined(DETERMINISTIC) +static CONST VECTOR_CC vfloat trighf_special(vfloat d) FUNC_ATTR { + vfloat2 dt = expk2f(vcast_vf2_vf_vf(vmul_vf_vf_vf(vabs_vf_vf(d),vcast_vf_f(0.5f)), vcast_vf_f(0))); + vfloat2 dy = dfscale_vf2_vf2_vf(dt, vcast_vf_f(0.5f)); + dy = dfmul_vf2_vf2_vf2(dy, dt); + return vadd_vf_vf_vf(vf2getx_vf_vf2(dy), vf2gety_vf_vf2(dy)); +} + EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5)); - y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), - visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y); + vopmask oflow = vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(ATRIGHF_FLT_MAX)); + y = vsel_vf_vo_vf_vf(oflow, vcast_vf_f(SLEEF_INFINITYf), y); + + // Fallback to special case subroutine for inputs between ATRIGHF_BOUND and ATRIGHF_FLT_MAX. + vopmask ocore = vle_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(TRIGHF_BOUND)); + vopmask o = vor_vo_vo_vo(ocore, oflow); + +#if defined(ENABLE_SVESTREAM) + y = vsel_vf_vo_vf_vf(o, y, trighf_special(x)); +#else + if(!LIKELY(vtestallones_i_vo32 (o))) y = vsel_vf_vo_vf_vf(o, y, trighf_special(x)); +#endif + y = vsel_vf_vo_vf_vf(visnan_vo_vf(y), vcast_vf_f(SLEEF_INFINITYf), y); y = vmulsign_vf_vf_vf(y, x); y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y))); @@ -2495,8 +2513,18 @@ EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) FUNC_ATTR { d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5)); - y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), - visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y); + vopmask oflow = vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(ATRIGHF_FLT_MAX)); + y = vsel_vf_vo_vf_vf(oflow, vcast_vf_f(SLEEF_INFINITYf), y); + + // Fallback to special case subroutine for inputs between TRIGHF_BOUND and ATRIGHF_FLT_MAX. + vopmask ocore = vle_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(TRIGHF_BOUND)); + vopmask o = vor_vo_vo_vo(ocore, oflow); +#if defined(ENABLE_SVESTREAM) + y = vsel_vf_vo_vf_vf(o, y, trighf_special(x)); +#else + if(!LIKELY(vtestallones_i_vo32 (o))) y = vsel_vf_vo_vf_vf(o, y, trighf_special(x)); +#endif + y = vsel_vf_vo_vf_vf(visnan_vo_vf(y), vcast_vf_f(SLEEF_INFINITYf), y); y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y))); return y; diff --git a/src/libm/sleefsp.c b/src/libm/sleefsp.c index 1df87f4..22d055b 100644 --- a/src/libm/sleefsp.c +++ b/src/libm/sleefsp.c @@ -1398,7 +1398,14 @@ EXPORT CONST float xsinhf(float x) { d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; - y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; + if (fabsfk(x) > ATRIGHF_FLT_MAX) { + y = SLEEF_INFINITYf; + } else if (fabsfk(x) > TRIGHF_BOUND) { + Sleef_float2 dt = expk2f(df(fabsfk(x)/2, 0)); + Sleef_float2 dy = dfscale_f2_f2_f(dt, 0.5f); + dy = dfmul_f2_f2_f2(dy, dt); + y = dy.x + dy.y; + } y = xisnanf(y) ? SLEEF_INFINITYf : y; y = mulsignf(y, x); y = xisnanf(x) ? SLEEF_NANf : y; @@ -1412,7 +1419,14 @@ EXPORT CONST float xcoshf(float x) { d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; - y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; + if (fabsfk(x) > ATRIGHF_FLT_MAX) { + y = SLEEF_INFINITYf; + } else if (fabsfk(x) > TRIGHF_BOUND) { + Sleef_float2 dt = expk2f(df(fabsfk(x)/2, 0)); + Sleef_float2 dy = dfscale_f2_f2_f(dt, 0.5f); + dy = dfmul_f2_f2_f2(dy, dt); + y = dy.x + dy.y; + } y = xisnanf(y) ? SLEEF_INFINITYf : y; y = xisnanf(x) ? SLEEF_NANf : y; -- GitLab From 304f2d355d612ae7f4f0b9271238508af80e2ad0 Mon Sep 17 00:00:00 2001 From: Pierre Blanchard Date: Fri, 7 Mar 2025 16:20:50 +0000 Subject: [PATCH 3/3] Fix cosh and sinh early overflow Similar fix as coshf/sinhf. As expected this incurred a slight performance regression. --- src/common/keywords.txt | 1 + src/common/misc.h | 2 ++ src/libm-tester/tester.c | 4 ++-- src/libm/sleefdp.c | 18 ++++++++++++++++-- src/libm/sleefsimddp.c | 35 +++++++++++++++++++++++++++++++++-- 5 files changed, 54 insertions(+), 6 deletions(-) diff --git a/src/common/keywords.txt b/src/common/keywords.txt index daaa2dc..b979809 100644 --- a/src/common/keywords.txt +++ b/src/common/keywords.txt @@ -143,6 +143,7 @@ tdxsetx_tdx_tdx_vd tdxsetxyz_tdx_tdx_vd_vd_vd tdxsety_tdx_tdx_vd tdxsetz_tdx_tdx_vd +trigh_special trighf_special vabs_vd_vd vabs_vf2_vf2 diff --git a/src/common/misc.h b/src/common/misc.h index 53902ec..761cd38 100644 --- a/src/common/misc.h +++ b/src/common/misc.h @@ -152,7 +152,9 @@ // - cosh(f)(x) and sinh(f)(x) approximation holds up to x equals #define TRIGHF_BOUND 0x1.6p+6f /* 88.0 */ +// - cosh(f)(x) and sinh(f)(x) should overflow for x higher than #define ATRIGHF_FLT_MAX 0x1.65a9f8p+6 /* 89.41599 */ +#define ATRIGH_DBL_MAX 0x1.633ce8fb9f87ep+9 /* 710.4759 */ // - log1p(f)(x) approximation holds up to x equals #define LOG1PF_BOUND 0x1.2ced32p+126f /* 1.0e+38 */ diff --git a/src/libm-tester/tester.c b/src/libm-tester/tester.c index 63f6d8c..2c0fb25 100644 --- a/src/libm-tester/tester.c +++ b/src/libm-tester/tester.c @@ -4021,14 +4021,14 @@ void do_test() { fprintf(stderr, "sinh : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_d(mpfr_sinh, child_sinh, d, 1.0); - for(d = -709;d < 709 && success;d += 0.2) checkAccuracy_d(mpfr_sinh, child_sinh, d, 1.0); + for(d = -LOG_DBL_MAX-4;d < LOG_DBL_MAX+4 && success;d += 0.2) checkAccuracy_d(mpfr_sinh, child_sinh, d, 1.0); showResult(success); // fprintf(stderr, "cosh : "); for(d = -10;d < 10 && success;d += 0.002) checkAccuracy_d(mpfr_cosh, child_cosh, d, 1.0); - for(d = -709;d < 709 && success;d += 0.2) checkAccuracy_d(mpfr_cosh, child_cosh, d, 1.0); + for(d = -LOG_DBL_MAX-4;d < LOG_DBL_MAX+4 && success;d += 0.2) checkAccuracy_d(mpfr_cosh, child_cosh, d, 1.0); showResult(success); // diff --git a/src/libm/sleefdp.c b/src/libm/sleefdp.c index f03fb9a..84f7128 100644 --- a/src/libm/sleefdp.c +++ b/src/libm/sleefdp.c @@ -1690,7 +1690,14 @@ EXPORT CONST double xsinh(double x) { d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d)); y = (d.x + d.y) * 0.5; - y = fabsk(x) > 710 ? SLEEF_INFINITY : y; + if (fabsk(x) > ATRIGH_DBL_MAX) { + y = SLEEF_INFINITY; + } else if (fabsk(x) > LOG_DBL_MAX) { + Sleef_double2 dt = expk2(dd(fabsk(x)/2, 0)); + Sleef_double2 dy = ddscale_d2_d2_d(dt, 0.5); + dy = ddmul_d2_d2_d2(dy, dt); + y = dy.x + dy.y; + } y = xisnan(y) ? SLEEF_INFINITY : y; y = mulsign(y, x); y = xisnan(x) ? SLEEF_NAN : y; @@ -1704,7 +1711,14 @@ EXPORT CONST double xcosh(double x) { d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d)); y = (d.x + d.y) * 0.5; - y = fabsk(x) > 710 ? SLEEF_INFINITY : y; + if (fabsk(x) > ATRIGH_DBL_MAX) { + y = SLEEF_INFINITY; + } else if (fabsk(x) > LOG_DBL_MAX) { + Sleef_double2 dt = expk2(dd(fabsk(x)/2, 0)); + Sleef_double2 dy = ddscale_d2_d2_d(dt, 0.5); + dy = ddmul_d2_d2_d2(dy, dt); + y = dy.x + dy.y; + } y = xisnan(y) ? SLEEF_INFINITY : y; y = xisnan(x) ? SLEEF_NAN : y; diff --git a/src/libm/sleefsimddp.c b/src/libm/sleefsimddp.c index fda80c2..39035ef 100644 --- a/src/libm/sleefsimddp.c +++ b/src/libm/sleefsimddp.c @@ -2435,13 +2435,32 @@ static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) FUNC_ATTR { } #if !defined(DETERMINISTIC) +static CONST VECTOR_CC vdouble trigh_special(vdouble d) FUNC_ATTR { + vdouble2 dt = expk2(vcast_vd2_vd_vd(vmul_vd_vd_vd(vabs_vd_vd(d),vcast_vd_d(0.5)), vcast_vd_d(0))); + vdouble2 dy = ddscale_vd2_vd2_vd(dt, vcast_vd_d(0.5)); + dy = ddmul_vd2_vd2_vd2(dy, dt); + return vadd_vd_vd_vd(vd2getx_vd_vd2(dy), vd2gety_vd_vd2(dy)); +} + EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); - y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); + vopmask oflow = vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(ATRIGH_DBL_MAX)); + y = vsel_vd_vo_vd_vd(oflow, vcast_vd_d(SLEEF_INFINITY), y); + + // Fallback to special case subroutine for inputs between LOG_DBL_MAX and ATRIGH_DBL_MAX. + vopmask ocore = vle_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(LOG_DBL_MAX)); + vopmask o = vor_vo_vo_vo(ocore, oflow); + +#if defined(ENABLE_SVESTREAM) + y = vsel_vd_vo_vd_vd(o, y, trigh_special(x)); +#else + if(!LIKELY(vtestallones_i_vo64 (o))) y = vsel_vd_vo_vd_vd(o, y, trigh_special(x)); +#endif + y = vsel_vd_vo_vd_vd(visnan_vo_vd(y), vcast_vd_d(SLEEF_INFINITY), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); @@ -2454,7 +2473,19 @@ EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) FUNC_ATTR { d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); - y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); + vopmask oflow = vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(ATRIGH_DBL_MAX)); + y = vsel_vd_vo_vd_vd(oflow, vcast_vd_d(SLEEF_INFINITY), y); + + // Fallback to special case subroutine for inputs between LOG_DBL_MAX and ATRIGH_DBL_MAX. + vopmask ocore = vle_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(LOG_DBL_MAX)); + vopmask o = vor_vo_vo_vo(ocore, oflow); + +#if defined(ENABLE_SVESTREAM) + y = vsel_vd_vo_vd_vd(o, y, trigh_special(x)); +#else + if(!LIKELY(vtestallones_i_vo64 (o))) y = vsel_vd_vo_vd_vd(o, y, trigh_special(x)); +#endif + y = vsel_vd_vo_vd_vd(visnan_vo_vd(y), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; -- GitLab