diff --git a/CMakeLists.txt b/CMakeLists.txt index 12600629a78b9e6db7fc803fd92ef2e6b7b600c3..58e6d786d4a0760b9d7d01539b308efdb5bbc43c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ endfunction() set(SLEEF_ALL_SUPPORTED_EXTENSIONS AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86 - SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64 + SVENOFMA SVE SVESTREAM ADVSIMDNOFMA ADVSIMD # Aarch64 NEON32 NEON32VFPV4 # Aarch32 VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64 VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z @@ -108,7 +108,7 @@ set(SLEEF_ALL_SUPPORTED_EXTENSIONS set(SLEEF_SUPPORTED_LIBM_EXTENSIONS AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86 - SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64 + SVENOFMA SVE SVESTREAM ADVSIMDNOFMA ADVSIMD # Aarch64 NEON32 NEON32VFPV4 # Aarch32 VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64 VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z @@ -116,8 +116,8 @@ set(SLEEF_SUPPORTED_LIBM_EXTENSIONS PUREC_SCALAR PURECFMA_SCALAR # Generic type CACHE STRING "List of SIMD architectures supported by libsleef." ) -set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS - SSE2 AVX AVX2 AVX512F ADVSIMD SVE +set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS + SSE2 AVX AVX2 AVX512F ADVSIMD SVE SVESTREAM CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI." ) diff --git a/Configure.cmake b/Configure.cmake index 7c64ce962601a788236ec8bf0d2a07cfb12bab4c..643fbc102fc4d1a2f6eb8c8fb75c2a2e83519f99 100644 --- a/Configure.cmake +++ b/Configure.cmake @@ -171,6 +171,7 @@ set(CLANG_FLAGS_ENABLE_NEON32VFPV4 "-march=armv7-a;-mfpu=neon-vfpv4") # Arm AArch64 vector extensions. 
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve") set(CLANG_FLAGS_ENABLE_SVENOFMA "-march=armv8-a+sve") +set(CLANG_FLAGS_ENABLE_SVESTREAM "-march=armv9-a+sme") # PPC64 set(CLANG_FLAGS_ENABLE_VSX "-mcpu=power8") set(CLANG_FLAGS_ENABLE_VSXNOFMA "-mcpu=power8") @@ -527,6 +528,9 @@ endif() option(SLEEF_DISABLE_SVE "Disable SVE" OFF) option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF) +option(SLEEF_DISABLE_SVESTREAM "Disable Streaming SVE" OFF) +option(SLEEF_ENFORCE_SVESTREAM "Build fails if Streaming SVE is not supported by the compiler" OFF) + # Darwin does not support SVE yet (see issue #474), # therefore we disable SVE on Darwin systems. if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") @@ -539,6 +543,16 @@ if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQU if(COMPILER_SUPPORTS_SVE) set(COMPILER_SUPPORTS_SVENOFMA 1) + + if (NOT SLEEF_DISABLE_SVESTREAM) + string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SVESTREAM}") + CHECK_C_SOURCE_COMPILES(" + #include <arm_sme.h> + int get_svl(void) __arm_streaming { return svcntw(); } + int main() { + int svl = get_svl(); }" + COMPILER_SUPPORTS_SVESTREAM) + endif() endif() endif() @@ -546,6 +560,12 @@ if (SLEEF_ENFORCE_SVE AND NOT COMPILER_SUPPORTS_SVE) message(FATAL_ERROR "SLEEF_ENFORCE_SVE is specified and that feature is disabled or not supported by the compiler") endif() +if (SLEEF_ENFORCE_SVESTREAM AND NOT COMPILER_SUPPORTS_SVE) + message(FATAL_ERROR "SLEEF_ENFORCE_SVESTREAM is specified but SVE is disabled or not supported by the compiler") +elseif (SLEEF_ENFORCE_SVESTREAM AND NOT COMPILER_SUPPORTS_SVESTREAM) + message(FATAL_ERROR "SLEEF_ENFORCE_SVESTREAM is specified and that feature is disabled or not supported by the compiler") +endif() + # VSX option(SLEEF_DISABLE_VSX "Disable VSX" OFF) diff --git a/src/arch/helpersve.h b/src/arch/helpersve.h index
853e752ee0f05745ce8dbfd3d1c9f9732f4545ca..60602caebbfccbe56c160266837c18972924cf2d 100644 --- a/src/arch/helpersve.h +++ b/src/arch/helpersve.h @@ -29,6 +29,19 @@ #define ISANAME "AArch64 SVE" #define ptrue svptrue_b8() //@#define ptrue svptrue_b8() +#elif CONFIG == 3 +// Vector length agnostic +#define VECTLENSP (svcntw()) +//@#define VECTLENSP (svcntw()) +#define VECTLENDP (svcntd()) +//@#define VECTLENDP (svcntd()) +#define ISANAME "AArch64 Streaming SVE" +#define ptrue svptrue_b8() +//@#define ptrue svptrue_b8() +#undef FUNC_ATTR +//@#undef FUNC_ATTR +#define FUNC_ATTR STREAM_ATTR +//@#define FUNC_ATTR STREAM_ATTR #elif CONFIG == 8 // 256-bit vector length #define ISANAME "AArch64 SVE 256-bit" @@ -66,9 +79,9 @@ #define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENDP (1 << LOG2VECTLENDP) #define VECTLENSP (1 << LOG2VECTLENSP) -static INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; } +static INLINE int vavailability_i(int name) FUNC_ATTR { return svcntd() >= VECTLENDP ? 
3 : 0; } #else -static INLINE int vavailability_i(int name) { return 3; } +static INLINE int vavailability_i(int name) FUNC_ATTR { return 3; } #endif #define ENABLE_SP @@ -108,26 +121,26 @@ typedef svuint64_t vuint64; // Double-double data type with setter/getter functions typedef svfloat64x2_t vdouble2; -static INLINE vdouble vd2getx_vd_vd2(vdouble2 v) { return svget2_f64(v, 0); } -static INLINE vdouble vd2gety_vd_vd2(vdouble2 v) { return svget2_f64(v, 1); } -static INLINE vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { return svcreate2_f64(x, y); } -static INLINE vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 0, d); } -static INLINE vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 1, d); } +static INLINE vdouble vd2getx_vd_vd2(vdouble2 v) FUNC_ATTR { return svget2_f64(v, 0); } +static INLINE vdouble vd2gety_vd_vd2(vdouble2 v) FUNC_ATTR { return svget2_f64(v, 1); } +static INLINE vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcreate2_f64(x, y); } +static INLINE vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) FUNC_ATTR { return svset2_f64(v, 0, d); } +static INLINE vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) FUNC_ATTR { return svset2_f64(v, 1, d); } // Double-float data type with setter/getter functions typedef svfloat32x2_t vfloat2; -static INLINE vfloat vf2getx_vf_vf2(vfloat2 v) { return svget2_f32(v, 0); } -static INLINE vfloat vf2gety_vf_vf2(vfloat2 v) { return svget2_f32(v, 1); } -static INLINE vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { return svcreate2_f32(x, y); } -static INLINE vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 0, d); } -static INLINE vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 1, d); } +static INLINE vfloat vf2getx_vf_vf2(vfloat2 v) FUNC_ATTR { return svget2_f32(v, 0); } +static INLINE vfloat vf2gety_vf_vf2(vfloat2 v) FUNC_ATTR { return svget2_f32(v, 1); } +static INLINE vfloat2 
vf2setxy_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcreate2_f32(x, y); } +static INLINE vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) FUNC_ATTR { return svset2_f32(v, 0, d); } +static INLINE vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) FUNC_ATTR { return svset2_f32(v, 1, d); } typedef svint32x2_t vquad; -static INLINE vmask vqgetx_vm_vq(vquad v) { return svget2_s32(v, 0); } -static INLINE vmask vqgety_vm_vq(vquad v) { return svget2_s32(v, 1); } -static INLINE vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return svcreate2_s32(x, y); } -static INLINE vquad vqsetx_vq_vq_vm(vquad v, vmask x) { return svset2_s32(v, 0, x); } -static INLINE vquad vqsety_vq_vq_vm(vquad v, vmask y) { return svset2_s32(v, 1, y); } +static INLINE vmask vqgetx_vm_vq(vquad v) FUNC_ATTR { return svget2_s32(v, 0); } +static INLINE vmask vqgety_vm_vq(vquad v) FUNC_ATTR { return svget2_s32(v, 1); } +static INLINE vquad vqsetxy_vq_vm_vm(vmask x, vmask y) FUNC_ATTR { return svcreate2_s32(x, y); } +static INLINE vquad vqsetx_vq_vq_vm(vquad v, vmask x) FUNC_ATTR { return svset2_s32(v, 0, x); } +static INLINE vquad vqsety_vq_vq_vm(vquad v, vmask y) FUNC_ATTR { return svset2_s32(v, 1, y); } typedef vquad vargquad; @@ -135,9 +148,9 @@ typedef vquad vargquad; typedef svfloat64x2_t di_t; -static INLINE vdouble digetd_vd_di(di_t d) { return svget2_f64(d, 0); } -static INLINE vint digeti_vi_di(di_t d) { return svreinterpret_s32_f64(svget2_f64(d, 1)); } -static INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) { +static INLINE vdouble digetd_vd_di(di_t d) FUNC_ATTR { return svget2_f64(d, 0); } +static INLINE vint digeti_vi_di(di_t d) FUNC_ATTR { return svreinterpret_s32_f64(svget2_f64(d, 1)); } +static INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) FUNC_ATTR { return svcreate2_f64(d, svreinterpret_f64_s32(i)); } @@ -145,9 +158,9 @@ static INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) { typedef svfloat32x2_t fi_t; -static INLINE vfloat figetd_vf_di(fi_t d) { return svget2_f32(d, 0); } -static INLINE 
vint2 figeti_vi2_di(fi_t d) { return svreinterpret_s32_f32(svget2_f32(d, 1)); } -static INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { +static INLINE vfloat figetd_vf_di(fi_t d) FUNC_ATTR { return svget2_f32(d, 0); } +static INLINE vint2 figeti_vi2_di(fi_t d) FUNC_ATTR { return svreinterpret_s32_f32(svget2_f32(d, 1)); } +static INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) FUNC_ATTR { return svcreate2_f32(d, svreinterpret_f32_s32(i)); } @@ -155,15 +168,15 @@ static INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { typedef svfloat64x3_t ddi_t; -static INLINE vdouble2 ddigetdd_vd2_ddi(ddi_t d) { +static INLINE vdouble2 ddigetdd_vd2_ddi(ddi_t d) FUNC_ATTR { return svcreate2_f64(svget3_f64(d, 0), svget3_f64(d, 1)); } -static INLINE vint ddigeti_vi_ddi(ddi_t d) { return svreinterpret_s32_f64(svget3_f64(d, 2)); } -static INLINE ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) { +static INLINE vint ddigeti_vi_ddi(ddi_t d) FUNC_ATTR { return svreinterpret_s32_f64(svget3_f64(d, 2)); } +static INLINE ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) FUNC_ATTR { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svreinterpret_f64_s32(i)); } -static INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) { +static INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) FUNC_ATTR { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svget3_f64(ddi, 2)); } @@ -171,15 +184,15 @@ static INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) { typedef svfloat32x3_t dfi_t; -static INLINE vfloat2 dfigetdf_vf2_dfi(dfi_t d) { +static INLINE vfloat2 dfigetdf_vf2_dfi(dfi_t d) FUNC_ATTR { return svcreate2_f32(svget3_f32(d, 0), svget3_f32(d, 1)); } -static INLINE vint2 dfigeti_vi2_dfi(dfi_t d) { return svreinterpret_s32_f32(svget3_f32(d, 2)); } -static INLINE dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { +static INLINE vint2 dfigeti_vi2_dfi(dfi_t d) FUNC_ATTR { return svreinterpret_s32_f32(svget3_f32(d, 2)); } +static INLINE dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, 
vint2 i) FUNC_ATTR { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svreinterpret_f32_s32(i)); } -static INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { +static INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) FUNC_ATTR { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svget3_f32(dfi, 2)); } @@ -187,14 +200,14 @@ static INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { typedef svfloat64x4_t dd2; -static INLINE dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) { +static INLINE dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) FUNC_ATTR { return svcreate4_f64(svget2_f64(a, 0), svget2_f64(a, 1), svget2_f64(b, 0), svget2_f64(b, 1)); } -static INLINE vdouble2 dd2geta_vd2_dd2(dd2 d) { +static INLINE vdouble2 dd2geta_vd2_dd2(dd2 d) FUNC_ATTR { return svcreate2_f64(svget4_f64(d, 0), svget4_f64(d, 1)); } -static INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) { +static INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) FUNC_ATTR { return svcreate2_f64(svget4_f64(d, 2), svget4_f64(d, 3)); } @@ -202,14 +215,14 @@ static INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) { typedef svfloat32x4_t df2; -static INLINE df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { +static INLINE df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) FUNC_ATTR { return svcreate4_f32(svget2_f32(a, 0), svget2_f32(a, 1), svget2_f32(b, 0), svget2_f32(b, 1)); } -static INLINE vfloat2 df2geta_vf2_df2(df2 d) { +static INLINE vfloat2 df2geta_vf2_df2(df2 d) FUNC_ATTR { return svcreate2_f32(svget4_f32(d, 0), svget4_f32(d, 1)); } -static INLINE vfloat2 df2getb_vf2_df2(df2 d) { +static INLINE vfloat2 df2getb_vf2_df2(df2 d) FUNC_ATTR { return svcreate2_f32(svget4_f32(d, 2), svget4_f32(d, 3)); } @@ -217,44 +230,44 @@ static INLINE vfloat2 df2getb_vf2_df2(df2 d) { typedef svfloat64x3_t vdouble3; -static INLINE vdouble vd3getx_vd_vd3(vdouble3 v) { return svget3_f64(v, 0); } -static INLINE vdouble vd3gety_vd_vd3(vdouble3 v) { return svget3_f64(v, 1); } -static INLINE vdouble vd3getz_vd_vd3(vdouble3 v) { return 
svget3_f64(v, 2); } -static INLINE vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return svcreate3_f64(x, y, z); } -static INLINE vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 0, d); } -static INLINE vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 1, d); } -static INLINE vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 2, d); } +static INLINE vdouble vd3getx_vd_vd3(vdouble3 v) FUNC_ATTR { return svget3_f64(v, 0); } +static INLINE vdouble vd3gety_vd_vd3(vdouble3 v) FUNC_ATTR { return svget3_f64(v, 1); } +static INLINE vdouble vd3getz_vd_vd3(vdouble3 v) FUNC_ATTR { return svget3_f64(v, 2); } +static INLINE vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return svcreate3_f64(x, y, z); } +static INLINE vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { return svset3_f64(v, 0, d); } +static INLINE vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { return svset3_f64(v, 1, d); } +static INLINE vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { return svset3_f64(v, 2, d); } // typedef svfloat64x4_t tdx; -static INLINE vmask tdxgete_vm_tdx(tdx t) { +static INLINE vmask tdxgete_vm_tdx(tdx t) FUNC_ATTR { return svreinterpret_s32_f64(svget4_f64(t, 0)); } -static INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) { +static INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) FUNC_ATTR { return svcreate3_f64(svget4_f64(t, 1), svget4_f64(t, 2), svget4_f64(t, 3)); } -static INLINE vdouble tdxgetd3x_vd_tdx(tdx t) { return svget4_f64(t, 1); } -static INLINE vdouble tdxgetd3y_vd_tdx(tdx t) { return svget4_f64(t, 2); } -static INLINE vdouble tdxgetd3z_vd_tdx(tdx t) { return svget4_f64(t, 3); } -static INLINE tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { +static INLINE vdouble tdxgetd3x_vd_tdx(tdx t) FUNC_ATTR { return svget4_f64(t, 1); } +static INLINE vdouble tdxgetd3y_vd_tdx(tdx t) FUNC_ATTR { return svget4_f64(t, 2); } +static INLINE vdouble 
tdxgetd3z_vd_tdx(tdx t) FUNC_ATTR { return svget4_f64(t, 3); } +static INLINE tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) FUNC_ATTR { return svset4_f64(t, 0, svreinterpret_f64_s32(e)); } -static INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { +static INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) FUNC_ATTR { return svcreate4_f64(svget4_f64(t, 0), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } -static INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { return svset4_f64(t, 1, x); } -static INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { return svset4_f64(t, 2, y); } -static INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { return svset4_f64(t, 3, z); } -static INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) { +static INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) FUNC_ATTR { return svset4_f64(t, 1, x); } +static INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) FUNC_ATTR { return svset4_f64(t, 2, y); } +static INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) FUNC_ATTR { return svset4_f64(t, 3, z); } +static INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) FUNC_ATTR { return svcreate4_f64(svget4_f64(t, 0), x, y, z); } -static INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { +static INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) FUNC_ATTR { return svcreate4_f64(svreinterpret_f64_s32(e), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } -static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) { +static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) FUNC_ATTR { return svcreate4_f64(svreinterpret_f64_s32(e), x, y, z); } @@ -262,16 +275,16 @@ static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdou typedef svfloat64x4_t tdi_t; -static INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) { +static INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) FUNC_ATTR { return svcreate3_f64(svget4_f64(d, 0), 
svget4_f64(d, 1), svget4_f64(d, 2)); } -static INLINE vdouble tdigetx_vd_tdi(tdi_t d) { return svget4_f64(d, 0); } -static INLINE vint tdigeti_vi_tdi(tdi_t d) { return svreinterpret_s32_f64(svget4_f64(d, 3)); } -static INLINE tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) { +static INLINE vdouble tdigetx_vd_tdi(tdi_t d) FUNC_ATTR { return svget4_f64(d, 0); } +static INLINE vint tdigeti_vi_tdi(tdi_t d) FUNC_ATTR { return svreinterpret_s32_f64(svget4_f64(d, 3)); } +static INLINE tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) FUNC_ATTR { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svreinterpret_f64_s32(i)); } -static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) { +static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) FUNC_ATTR { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svget4_f64(tdi, 3)); } @@ -283,7 +296,7 @@ static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) { //@#define ALL_TRUE_MASK svdup_n_s32(0xffffffff) //@#define ALL_FALSE_MASK svdup_n_s32(0x0) -static INLINE void vprefetch_v_p(const void *ptr) {} +static INLINE void vprefetch_v_p(const void *ptr) FUNC_ATTR {} // // @@ -292,12 +305,12 @@ static INLINE void vprefetch_v_p(const void *ptr) {} // // // -static INLINE int vtestallones_i_vo32(vopmask g) { +static INLINE int vtestallones_i_vo32(vopmask g) FUNC_ATTR { svbool_t pg = svptrue_b32(); return (svcntp_b32(pg, g) == svcntw()); } -static INLINE int vtestallones_i_vo64(vopmask g) { +static INLINE int vtestallones_i_vo64(vopmask g) FUNC_ATTR { svbool_t pg = svptrue_b64(); return (svcntp_b64(pg, g) == svcntd()); } @@ -309,56 +322,56 @@ static INLINE int vtestallones_i_vo64(vopmask g) { // // Vector load / store -static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { svst1_s32(ptrue, p, v); } +static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) FUNC_ATTR { svst1_s32(ptrue, p, v); } -static INLINE vfloat vload_vf_p(const float *ptr) { +static INLINE vfloat 
vload_vf_p(const float *ptr) FUNC_ATTR { return svld1_f32(ptrue, ptr); } -static INLINE vfloat vloadu_vf_p(const float *ptr) { +static INLINE vfloat vloadu_vf_p(const float *ptr) FUNC_ATTR { return svld1_f32(ptrue, ptr); } -static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { +static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) FUNC_ATTR { svst1_f32(ptrue, ptr, v); } // Basic logical operations for mask -static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return svand_s32_x(ptrue, x, y); } -static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return svbic_s32_x(ptrue, y, x); } -static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return svorr_s32_x(ptrue, x, y); } -static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return sveor_s32_x(ptrue, x, y); } -static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return svreinterpret_s32_s64( svadd_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y))); } // Mask <--> single precision reinterpret -static INLINE vmask vreinterpret_vm_vf(vfloat vf) { +static INLINE vmask vreinterpret_vm_vf(vfloat vf) FUNC_ATTR { return svreinterpret_s32_f32(vf); } -static INLINE vfloat vreinterpret_vf_vm(vmask vm) { +static INLINE vfloat vreinterpret_vf_vm(vmask vm) FUNC_ATTR { return svreinterpret_f32_s32(vm); } -static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { +static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) FUNC_ATTR { return svreinterpret_f32_s32(vm); } -static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { +static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) FUNC_ATTR { return svreinterpret_s32_f32(vf); } -static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; } -static INLINE 
vmask vcast_vm_vi2(vint2 vi) { return vi; } +static INLINE vint2 vcast_vi2_vm(vmask vm) FUNC_ATTR { return vm; } +static INLINE vmask vcast_vm_vi2(vint2 vi) FUNC_ATTR { return vi; } // Conditional select -static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { +static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) FUNC_ATTR { return svsel_s32(svcmpeq_s32(ptrue, m, ALL_TRUE_MASK), x, y); } @@ -366,82 +379,82 @@ static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { /* Single precision FP operations */ /****************************************/ // Broadcast -static INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); } +static INLINE vfloat vcast_vf_f(float f) FUNC_ATTR { return svdup_n_f32(f); } // Add, Sub, Mul -static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { +static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svadd_f32_x(ptrue, x, y); } -static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { +static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svsub_f32_x(ptrue, x, y); } -static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { +static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svmul_f32_x(ptrue, x, y); } // |x|, -x -static INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); } -static INLINE vfloat vneg_vf_vf(vfloat f) { return svneg_f32_x(ptrue, f); } +static INLINE vfloat vabs_vf_vf(vfloat f) FUNC_ATTR { return svabs_f32_x(ptrue, f); } +static INLINE vfloat vneg_vf_vf(vfloat f) FUNC_ATTR { return svneg_f32_x(ptrue, f); } // max, min -static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { +static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svmax_f32_x(ptrue, x, y); } -static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { +static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svmin_f32_x(ptrue, x, y); } // int <--> float conversions -static INLINE vint2 vtruncate_vi2_vf(vfloat 
vf) { +static INLINE vint2 vtruncate_vi2_vf(vfloat vf) FUNC_ATTR { return svcvt_s32_f32_x(ptrue, vf); } -static INLINE vfloat vcast_vf_vi2(vint2 vi) { +static INLINE vfloat vcast_vf_vi2(vint2 vi) FUNC_ATTR { return svcvt_f32_s32_x(ptrue, vi); } -static INLINE vint2 vcast_vi2_i(int i) { return svdup_n_s32(i); } -static INLINE vint2 vrint_vi2_vf(vfloat d) { +static INLINE vint2 vcast_vi2_i(int i) FUNC_ATTR { return svdup_n_s32(i); } +static INLINE vint2 vrint_vi2_vf(vfloat d) FUNC_ATTR { return svcvt_s32_f32_x(ptrue, svrintn_f32_x(ptrue, d)); } -#if CONFIG == 1 +#if CONFIG != 2 // Multiply accumulate: z = z + x * y -static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return svmad_f32_x(ptrue, x, y, z); } // Multiply subtract: z = z - x * y -static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return svmsb_f32_x(ptrue, x, y, z); } -static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return svnmsb_f32_x(ptrue, x, y, z); } #else -static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } -static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } -static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } 
#endif // fused multiply add / sub static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, - vfloat z) { // z + x * y + vfloat z) FUNC_ATTR { // z + x * y return svmad_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, - vfloat z) { // z - x * y + vfloat z) FUNC_ATTR { // z - x * y return svmsb_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, - vfloat z) { // x * y - z + vfloat z) FUNC_ATTR { // x * y - z return svnmsb_f32_x(ptrue, x, y, z); } // conditional select -static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { +static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) FUNC_ATTR { return svsel_f32(mask, x, y); } // Reciprocal 1/x, Division, Square root -static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { +static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTDIV return svdiv_f32_x(ptrue, n, d); #else @@ -462,7 +475,7 @@ static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { return u; #endif } -static INLINE vfloat vrec_vf_vf(vfloat d) { +static INLINE vfloat vrec_vf_vf(vfloat d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTDIV return svdivr_n_f32_x(ptrue, d, 1.0f); #else @@ -470,7 +483,7 @@ static INLINE vfloat vrec_vf_vf(vfloat d) { vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d)); #endif } -static INLINE vfloat vsqrt_vf_vf(vfloat d) { +static INLINE vfloat vsqrt_vf_vf(vfloat d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTSQRT return svsqrt_f32_x(ptrue, d); #else @@ -500,15 +513,15 @@ static INLINE vfloat vsqrt_vf_vf(vfloat d) { // // // -static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { +static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) FUNC_ATTR { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } -static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { +static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float 
d0, float d1, float d2) FUNC_ATTR { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } -static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { +static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) FUNC_ATTR { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } // @@ -519,7 +532,7 @@ static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2 // // truncate -static INLINE vfloat vtruncate_vf_vf(vfloat vd) { +static INLINE vfloat vtruncate_vf_vf(vfloat vd) FUNC_ATTR { return svrintz_f32_x(ptrue, vd); } @@ -530,7 +543,7 @@ static INLINE vfloat vtruncate_vf_vf(vfloat vd) { // // // -static INLINE vfloat vrint_vf_vf(vfloat vf) { +static INLINE vfloat vrint_vf_vf(vfloat vf) FUNC_ATTR { return svrintn_f32_x(svptrue_b32(), vf); } // @@ -545,25 +558,25 @@ static INLINE vfloat vrint_vf_vf(vfloat vf) { /***************************************/ // Add, Sub, Neg (-x) -static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svadd_s32_x(ptrue, x, y); } -static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svsub_s32_x(ptrue, x, y); } -static INLINE vint2 vneg_vi2_vi2(vint2 e) { return svneg_s32_x(ptrue, e); } +static INLINE vint2 vneg_vi2_vi2(vint2 e) FUNC_ATTR { return svneg_s32_x(ptrue, e); } // Logical operations -static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svand_s32_x(ptrue, x, y); } -static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svbic_s32_x(ptrue, y, x); } -static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 
y) { +static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svorr_s32_x(ptrue, x, y); } -static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return sveor_s32_x(ptrue, x, y); } @@ -577,12 +590,12 @@ static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { //@#define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c) // Comparison returning integers -static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svsel_s32(svcmpgt_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK); } // conditional select -static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { +static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) FUNC_ATTR { return svsel_s32(m, x, y); } @@ -590,84 +603,108 @@ static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { /* opmask operations */ /****************************************/ // single precision FP -static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmpeq_f32(ptrue, x, y); } -static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmpne_f32(ptrue, x, y); } -static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmplt_f32(ptrue, x, y); } -static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmple_f32(ptrue, x, y); } -static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmpgt_f32(ptrue, x, y); } -static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { +static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return svcmpge_f32(ptrue, x, y); } 
-static INLINE vopmask visinf_vo_vf(vfloat d) { +static INLINE vopmask visinf_vo_vf(vfloat d) FUNC_ATTR { return svcmpeq_n_f32(ptrue, vabs_vf_vf(d), SLEEF_INFINITYf); } -static INLINE vopmask vispinf_vo_vf(vfloat d) { +static INLINE vopmask vispinf_vo_vf(vfloat d) FUNC_ATTR { return svcmpeq_n_f32(ptrue, d, SLEEF_INFINITYf); } -static INLINE vopmask visminf_vo_vf(vfloat d) { +static INLINE vopmask visminf_vo_vf(vfloat d) FUNC_ATTR { return svcmpeq_n_f32(ptrue, d, -SLEEF_INFINITYf); } -static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } +static INLINE vopmask visnan_vo_vf(vfloat d) FUNC_ATTR { return vneq_vo_vf_vf(d, d); } // integers -static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { +static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svcmpeq_s32(ptrue, x, y); } -static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { +static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svcmpgt_s32(ptrue, x, y); } // logical opmask -static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { +static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) FUNC_ATTR { +#if CONFIG == 3 // ENABLE_SVESTREAM + svuint8_t xm = svdup_n_u8_z(x, 1); + svuint8_t ym = svdup_n_u8_z(y, 1); + return svcmpeq_n_u8(ptrue, svand_u8_z(ptrue, xm, ym), 1); +#else return svand_b_z(ptrue, x, y); +#endif } -static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { +static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) FUNC_ATTR { +#if CONFIG == 3 // ENABLE_SVESTREAM + svuint8_t xm = svdup_n_u8_z(x, 1); + svuint8_t ym = svdup_n_u8_z(y, 1); + return svcmpeq_n_u8(ptrue, svbic_u8_z(ptrue, ym, xm), 1); +#else return svbic_b_z(ptrue, y, x); +#endif } -static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { +static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) FUNC_ATTR { +#if CONFIG == 3 // ENABLE_SVESTREAM + svuint8_t xm = svdup_n_u8_z(x, 1); + svuint8_t ym = svdup_n_u8_z(y, 1); + return svcmpeq_n_u8(ptrue, 
svorr_u8_z(ptrue, xm, ym), 1); +#else return svorr_b_z(ptrue, x, y); +#endif } -static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { +static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) FUNC_ATTR { +#if CONFIG == 3 // ENABLE_SVESTREAM + svuint8_t xm = svdup_n_u8_z(x, 1); + svuint8_t ym = svdup_n_u8_z(y, 1); + return svcmpeq_n_u8(ptrue, sveor_u8_z(ptrue, xm, ym), 1); +#else return sveor_b_z(ptrue, x, y); +#endif } -static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { +static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) FUNC_ATTR { // This needs to be zeroing to prevent asinf and atanf denormal test // failing. return svand_s32_z(x, y, y); } // bitmask logical operations -static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { +static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) FUNC_ATTR { return svsel_s32(x, y, ALL_FALSE_MASK); } -static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { +static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) FUNC_ATTR { return svsel_s32(x, ALL_FALSE_MASK, y); } -static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { +static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) FUNC_ATTR { return svsel_s32(x, ALL_TRUE_MASK, y); } // broadcast bitmask -static INLINE vmask vcast_vm_i_i(int i0, int i1) { +static INLINE vmask vcast_vm_i_i(int i0, int i1) FUNC_ATTR { return svreinterpret_s32_u64( svdup_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32))); } -static INLINE vmask vcast_vm_i64(int64_t i) { +static INLINE vmask vcast_vm_i64(int64_t i) FUNC_ATTR { return svreinterpret_s32_u64(svdup_n_u64((uint64_t)i)); } -static INLINE vmask vcast_vm_u64(uint64_t i) { +static INLINE vmask vcast_vm_u64(uint64_t i) FUNC_ATTR { return svreinterpret_s32_u64(svdup_n_u64(i)); } @@ -676,131 +713,131 @@ static INLINE vmask vcast_vm_u64(uint64_t i) { /*********************************/ // Vector load/store -static INLINE vdouble vload_vd_p(const double *ptr) { +static INLINE vdouble vload_vd_p(const double *ptr) 
FUNC_ATTR { return svld1_f64(ptrue, ptr); } -static INLINE vdouble vloadu_vd_p(const double *ptr) { +static INLINE vdouble vloadu_vd_p(const double *ptr) FUNC_ATTR { return svld1_f64(ptrue, ptr); } -static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { +static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) FUNC_ATTR { svst1_f64(ptrue, ptr, v); } -static INLINE void vstoreu_v_p_vi(int *ptr, vint v) { +static INLINE void vstoreu_v_p_vi(int *ptr, vint v) FUNC_ATTR { svst1w_s64(ptrue, ptr, svreinterpret_s64_s32(v)); } -static vint vloadu_vi_p(int32_t *p) { +static vint vloadu_vi_p(int32_t *p) FUNC_ATTR { return svreinterpret_s32_s64(svld1uw_s64(ptrue, (uint32_t *)p)); } // Reinterpret -static INLINE vdouble vreinterpret_vd_vm(vmask vm) { +static INLINE vdouble vreinterpret_vd_vm(vmask vm) FUNC_ATTR { return svreinterpret_f64_s32(vm); } -static INLINE vmask vreinterpret_vm_vd(vdouble vd) { +static INLINE vmask vreinterpret_vm_vd(vdouble vd) FUNC_ATTR { return svreinterpret_s32_f64(vd); } -static INLINE vint2 vcastu_vm_vi(vint x) { +static INLINE vint2 vcastu_vm_vi(vint x) FUNC_ATTR { return svreinterpret_s32_s64( svlsl_n_s64_x(ptrue, svreinterpret_s64_s32(x), 32)); } -static INLINE vint vcastu_vi_vm(vint2 x) { +static INLINE vint vcastu_vi_vm(vint2 x) FUNC_ATTR { return svreinterpret_s32_u64( svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), 32)); } -static INLINE vdouble vcast_vd_vi(vint vi) { +static INLINE vdouble vcast_vd_vi(vint vi) FUNC_ATTR { return svcvt_f64_s32_x(ptrue, vi); } // Splat -static INLINE vdouble vcast_vd_d(double d) { return svdup_n_f64(d); } +static INLINE vdouble vcast_vd_d(double d) FUNC_ATTR { return svdup_n_f64(d); } // Conditional select -static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { +static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) FUNC_ATTR { return svsel_f64(o, x, y); } -static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { +static INLINE CONST vdouble 
vsel_vd_vo_d_d(vopmask o, double v1, double v0) FUNC_ATTR { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); } -static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { +static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) FUNC_ATTR { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } -static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { +static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) FUNC_ATTR { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } -static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { +static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) FUNC_ATTR { return svsel_s32(o, x, y); } // truncate -static INLINE vdouble vtruncate_vd_vd(vdouble vd) { +static INLINE vdouble vtruncate_vd_vd(vdouble vd) FUNC_ATTR { return svrintz_f64_x(ptrue, vd); } -static INLINE vint vtruncate_vi_vd(vdouble vd) { +static INLINE vint vtruncate_vi_vd(vdouble vd) FUNC_ATTR { return svcvt_s32_f64_x(ptrue, vd); } -static INLINE vint vrint_vi_vd(vdouble vd) { +static INLINE vint vrint_vi_vd(vdouble vd) FUNC_ATTR { return svcvt_s32_f64_x(ptrue, svrintn_f64_x(ptrue, vd)); } -static INLINE vdouble vrint_vd_vd(vdouble vd) { +static INLINE vdouble vrint_vd_vd(vdouble vd) FUNC_ATTR { return svrintn_f64_x(ptrue, vd); } // FP math operations -static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { +static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svadd_f64_x(ptrue, x, y); } -static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { +static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svsub_f64_x(ptrue, x, y); } -static INLINE vdouble vneg_vd_vd(vdouble x) { return 
svneg_f64_x(ptrue, x); } -static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { +static INLINE vdouble vneg_vd_vd(vdouble x) FUNC_ATTR { return svneg_f64_x(ptrue, x); } +static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svmul_f64_x(ptrue, x, y); } -static INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); } -static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { +static INLINE vdouble vabs_vd_vd(vdouble x) FUNC_ATTR { return svabs_f64_x(ptrue, x); } +static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svmax_f64_x(ptrue, x, y); } -static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { +static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svmin_f64_x(ptrue, x, y); } -#if CONFIG == 1 +#if CONFIG != 2 // Multiply accumulate / subtract static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, - vdouble z) { // z = x*y + z + vdouble z) FUNC_ATTR { // z = x*y + z return svmad_f64_x(ptrue, x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, - vdouble z) { // z = x * y - z + vdouble z) FUNC_ATTR { // z = x * y - z return svnmsb_f64_x(ptrue, x, y, z); } -static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { +static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return svmsb_f64_x(ptrue, x, y, z); } #else -static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } -static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } #endif static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, - vdouble 
z) { // z + x * y + vdouble z) FUNC_ATTR { // z + x * y return svmad_f64_x(ptrue, x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, - vdouble z) { // z - x * y + vdouble z) FUNC_ATTR { // z - x * y return svmsb_f64_x(ptrue, x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, - vdouble z) { // x * y - z + vdouble z) FUNC_ATTR { // x * y - z return svnmsb_f64_x(ptrue, x, y, z); } // Reciprocal 1/x, Division, Square root -static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) { +static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTDIV return svdiv_f64_x(ptrue, n, d); #else @@ -822,7 +859,7 @@ static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) { return u; #endif } -static INLINE vdouble vrec_vd_vd(vdouble d) { +static INLINE vdouble vrec_vd_vd(vdouble d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTDIV return svdivr_n_f64_x(ptrue, d, 1.0); #else @@ -830,7 +867,7 @@ static INLINE vdouble vrec_vd_vd(vdouble d) { vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d)); #endif } -static INLINE vdouble vsqrt_vd_vd(vdouble d) { +static INLINE vdouble vsqrt_vd_vd(vdouble d) FUNC_ATTR { #ifndef SLEEF_ENABLE_ALTSQRT return svsqrt_f64_x(ptrue, d); #else @@ -858,58 +895,58 @@ static INLINE vdouble vsqrt_vd_vd(vdouble d) { } // Float comparison -static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmplt_f64(ptrue, x, y); } -static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmpeq_f64(ptrue, x, y); } -static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmpgt_f64(ptrue, x, y); } -static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmpge_f64(ptrue, x, y); } 
-static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmpne_f64(ptrue, x, y); } -static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { +static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return svcmple_f64(ptrue, x, y); } // predicates -static INLINE vopmask visnan_vo_vd(vdouble vd) { +static INLINE vopmask visnan_vo_vd(vdouble vd) FUNC_ATTR { return svcmpne_f64(ptrue, vd, vd); } -static INLINE vopmask visinf_vo_vd(vdouble vd) { +static INLINE vopmask visinf_vo_vd(vdouble vd) FUNC_ATTR { return svcmpeq_n_f64(ptrue, svabs_f64_x(ptrue, vd), SLEEF_INFINITY); } -static INLINE vopmask vispinf_vo_vd(vdouble vd) { +static INLINE vopmask vispinf_vo_vd(vdouble vd) FUNC_ATTR { return svcmpeq_n_f64(ptrue, vd, SLEEF_INFINITY); } -static INLINE vopmask visminf_vo_vd(vdouble vd) { +static INLINE vopmask visminf_vo_vd(vdouble vd) FUNC_ATTR { return svcmpeq_n_f64(ptrue, vd, -SLEEF_INFINITY); } // Comparing bit masks -static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { +static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) FUNC_ATTR { return svcmpeq_s64(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)); } // pure predicate operations -static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; } -static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; } -static INLINE vopmask vcast_vo_i(int i) { return svcmpne_s32(ptrue, svdup_n_s32(i), svdup_n_s32(0)); } +static INLINE vopmask vcast_vo32_vo64(vopmask o) FUNC_ATTR { return o; } +static INLINE vopmask vcast_vo64_vo32(vopmask o) FUNC_ATTR { return o; } +static INLINE vopmask vcast_vo_i(int i) FUNC_ATTR { return svcmpne_s32(ptrue, svdup_n_s32(i), svdup_n_s32(0)); } // logical integer operations -static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { +static INLINE vint vand_vi_vo_vi(vopmask x, vint y) FUNC_ATTR { // This needs to be a zeroing instruction because we need to make // sure that the inactive 
elements for the unpacked integers vector // are zero. return svand_s32_z(x, y, y); } -static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { +static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) FUNC_ATTR { return svsel_s32(x, ALL_FALSE_MASK, y); } #define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c) @@ -917,68 +954,69 @@ static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { #define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c) //@#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c) -static INLINE vint vsrl_vi_vi_i(vint x, int c) { +static INLINE vint vsrl_vi_vi_i(vint x, int c) FUNC_ATTR { return svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c)); } -static INLINE vint vand_vi_vi_vi(vint x, vint y) { +static INLINE vint vand_vi_vi_vi(vint x, vint y) FUNC_ATTR { return svand_s32_x(ptrue, x, y); } -static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { +static INLINE vint vandnot_vi_vi_vi(vint x, vint y) FUNC_ATTR { return svbic_s32_x(ptrue, y, x); } -static INLINE vint vxor_vi_vi_vi(vint x, vint y) { +static INLINE vint vxor_vi_vi_vi(vint x, vint y) FUNC_ATTR { return sveor_s32_x(ptrue, x, y); } // integer math -static INLINE vint vadd_vi_vi_vi(vint x, vint y) { +static INLINE vint vadd_vi_vi_vi(vint x, vint y) FUNC_ATTR { return svadd_s32_x(ptrue, x, y); } -static INLINE vint vsub_vi_vi_vi(vint x, vint y) { +static INLINE vint vsub_vi_vi_vi(vint x, vint y) FUNC_ATTR { return svsub_s32_x(ptrue, x, y); } -static INLINE vint vneg_vi_vi(vint x) { return svneg_s32_x(ptrue, x); } +static INLINE vint vneg_vi_vi(vint x) FUNC_ATTR { return svneg_s32_x(ptrue, x); } // integer comparison -static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { +static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) FUNC_ATTR { return svcmpgt_s32(ptrue, x, y); } -static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { +static INLINE vopmask veq_vo_vi_vi(vint x, vint y) FUNC_ATTR { return svcmpeq_s32(ptrue, x, y); } // Splat -static INLINE vint vcast_vi_i(int i) { return 
svdup_n_s32(i); } +static INLINE vint vcast_vi_i(int i) FUNC_ATTR { return svdup_n_s32(i); } // bitmask logical operations -static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { +static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) FUNC_ATTR { // This needs to be a zeroing instruction because we need to make // sure that the inactive elements for the unpacked integers vector // are zero. return svreinterpret_s32_s64( svand_s64_z(x, svreinterpret_s64_s32(y), svreinterpret_s64_s32(y))); } -static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { +static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) FUNC_ATTR { return svreinterpret_s32_s64(svsel_s64( x, svreinterpret_s64_s32(ALL_FALSE_MASK), svreinterpret_s64_s32(y))); } -static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { +static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) FUNC_ATTR { return svreinterpret_s32_s64(svsel_s64( x, svreinterpret_s64_s32(ALL_TRUE_MASK), svreinterpret_s64_s32(y))); } -static INLINE vfloat vrev21_vf_vf(vfloat vf) { +static INLINE vfloat vrev21_vf_vf(vfloat vf) FUNC_ATTR { return svreinterpret_f32_u64(svrevw_u64_x(ptrue, svreinterpret_u64_f32(vf))); } // Comparison returning integer -static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { +static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) FUNC_ATTR { return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK); } // Gather +// vgather is non-streaming-compatible and must be called from non-streaming mode static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return svld1_gather_s64index_f64(ptrue, ptr, svreinterpret_s64_s32(vi)); } @@ -987,40 +1025,90 @@ static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return svld1_gather_s32index_f32(ptrue, ptr, vi2); } +// Switch to non-streaming mode for rempi so that gather can be performed +#if CONFIG == 3 // ENABLE_SVESTREAM +static NOINLINE void rempif_nostream(const float *a, float *res, dfi_t (*rempif_core)(vfloat)) { + const 
uint64_t svl = res[0]; + uint64_t i = 0; + svbool_t pg; + while (svptest_any(svptrue_b32(), pg = svwhilelt_b32_u64(i, svl))) { + svfloat32_t va = svld1_f32(pg, a + i); + dfi_t dfi = (*rempif_core)(va); + svst3_f32(pg, res + i * 3, dfi); + i += svcntw(); + } +} + +// Streaming mode and non-streaming mode can have different vector lengths. +// When passing vectors across function boundaries, vectors are stored/loaded +// as scalar arrays instead. +static INLINE dfi_t rempif_stream(vfloat va, dfi_t (*rempif_core)(vfloat)) FUNC_ATTR { + const uint64_t svl = svcntw(); + float a[svl], res[svl * 3]; + svst1_f32(svptrue_b32(), a, va); + // Passing streaming vector length in an output pointer to force evaluation in streaming mode + // as a temporary workaround for a bug in gcc. + res[0] = svl; + rempif_nostream(a, res, rempif_core); + return svld3_f32(svptrue_b32(), res); +} + +static NOINLINE void rempi_nostream(const double *a, double *res, ddi_t (*rempi_core)(vdouble)) { + const uint64_t svl = res[0]; + uint64_t i = 0; + svbool_t pg; + while (svptest_any(svptrue_b64(), pg = svwhilelt_b64_u64(i, svl))) { + svfloat64_t va = svld1_f64(pg, a + i); + ddi_t ddi = (*rempi_core)(va); + svst3_f64(pg, res + i * 3, ddi); + i += svcntd(); + } +} + +static INLINE ddi_t rempi_stream(vdouble va, ddi_t (*rempi_core)(vdouble)) FUNC_ATTR { + const uint64_t svl = svcntd(); + double a[svl], res[svl * 3]; + svst1_f64(svptrue_b64(), a, va); + res[0] = svl; + rempi_nostream(a, res, rempi_core); + return svld3_f64(svptrue_b64(), res); +} +#endif + // Operations for DFT -static INLINE vdouble vposneg_vd_vd(vdouble d) { +static INLINE vdouble vposneg_vd_vd(vdouble d) FUNC_ATTR { return svneg_f64_m(d, svdupq_n_b64(0, 1), d); } -static INLINE vdouble vnegpos_vd_vd(vdouble d) { +static INLINE vdouble vnegpos_vd_vd(vdouble d) FUNC_ATTR { return svneg_f64_m(d, svdupq_n_b64(1, 0), d); } -static INLINE vfloat vposneg_vf_vf(vfloat d) { +static INLINE vfloat vposneg_vf_vf(vfloat d) FUNC_ATTR { return 
svneg_f32_m(d, svdupq_n_b32(0, 1, 0, 1), d); } -static INLINE vfloat vnegpos_vf_vf(vfloat d) { +static INLINE vfloat vnegpos_vf_vf(vfloat d) FUNC_ATTR { return svneg_f32_m(d, svdupq_n_b32(1, 0, 1, 0), d); } -static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); } -static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); } -static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); } -static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } +static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); } +static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) FUNC_ATTR { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); } +static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); } +static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } // -static INLINE vdouble vrev21_vd_vd(vdouble x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); } +static INLINE vdouble vrev21_vd_vd(vdouble x) FUNC_ATTR { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); } -static INLINE vdouble vreva2_vd_vd(vdouble vd) { +static INLINE vdouble vreva2_vd_vd(vdouble vd) FUNC_ATTR { svint64_t x = svindex_s64((VECTLENDP-1), -1); x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x)); return svtbl_f64(vd, svreinterpret_u64_s64(x)); } -static INLINE vfloat vreva2_vf_vf(vfloat vf) { +static INLINE vfloat vreva2_vf_vf(vfloat vf) FUNC_ATTR { svint32_t x = svindex_s32((VECTLENSP-1), -1); x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x)); return svtbl_f32(vf, svreinterpret_u32_s32(x)); @@ -1036,33 +1124,33 @@ static INLINE void 
vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat svst1_scatter_u32index_f32(ptrue, ptr + offset*2, svzip1_u32(svindex_u32(0, step*2), svindex_u32(1, step*2)), v); } -static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { vstoreu_v_p_vd(ptr, v); } -static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); } -static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v); } -static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); } -static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); } -static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } +static INLINE void vstore_v_p_vd(double *ptr, vdouble v) FUNC_ATTR { vstoreu_v_p_vd(ptr, v); } +static INLINE void vstream_v_p_vd(double *ptr, vdouble v) FUNC_ATTR { vstore_v_p_vd(ptr, v); } +static INLINE void vstore_v_p_vf(float *ptr, vfloat v) FUNC_ATTR { vstoreu_v_p_vf(ptr, v); } +static INLINE void vstream_v_p_vf(float *ptr, vfloat v) FUNC_ATTR { vstore_v_p_vf(ptr, v); } +static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) FUNC_ATTR { vscatter2_v_p_i_i_vd(ptr, offset, step, v); } +static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) FUNC_ATTR { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // These functions are for debugging -static double vcast_d_vd(vdouble v) { +static double vcast_d_vd(vdouble v) FUNC_ATTR { double a[svcntd()]; vstoreu_v_p_vd(a, v); return a[0]; } -static float vcast_f_vf(vfloat v) { +static float vcast_f_vf(vfloat v) FUNC_ATTR { float a[svcntw()]; vstoreu_v_p_vf(a, v); return a[0]; } -static int vcast_i_vi(vint v) { +static int vcast_i_vi(vint v) FUNC_ATTR { int a[svcntw()]; vstoreu_v_p_vi(a, v); return a[0]; } -static int vcast_i_vi2(vint2 v) { +static int vcast_i_vi2(vint2 v) FUNC_ATTR { int 
a[svcntw()]; vstoreu_v_p_vi2(a, v); return a[0]; @@ -1070,34 +1158,34 @@ static int vcast_i_vi2(vint2 v) { // -static vquad loadu_vq_p(const int32_t *ptr) { +static vquad loadu_vq_p(const int32_t *ptr) FUNC_ATTR { int32_t a[svcntw()*2]; memcpy(a, ptr, svcntw()*8); return svld2_s32(ptrue, a); } -static INLINE vquad cast_vq_aq(vargquad aq) { return aq; } -static INLINE vargquad cast_aq_vq(vquad vq) { return vq; } +static INLINE vquad cast_vq_aq(vargquad aq) FUNC_ATTR { return aq; } +static INLINE vargquad cast_aq_vq(vquad vq) FUNC_ATTR { return vq; } -static INLINE int vtestallzeros_i_vo64(vopmask g) { +static INLINE int vtestallzeros_i_vo64(vopmask g) FUNC_ATTR { return svcntp_b64(svptrue_b64(), g) == 0; } -static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { +static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) FUNC_ATTR { return svreinterpret_s32_s64(svsel_s64(o, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y))); } -static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { +static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) FUNC_ATTR { return svreinterpret_s32_s64( svsub_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y))); } -static INLINE vmask vneg64_vm_vm(vmask x) { +static INLINE vmask vneg64_vm_vm(vmask x) FUNC_ATTR { return svreinterpret_s32_s64(svneg_s64_x(ptrue, svreinterpret_s64_s32(x))); } -static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { +static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) FUNC_ATTR { return svcmpgt_s64(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)); } @@ -1106,10 +1194,10 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { #define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c)) //@#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c)) -static INLINE vmask vcast_vm_vi(vint vi) { return svreinterpret_s32_s64(svextw_s64_z(ptrue, svreinterpret_s64_s32(vi))); } -static 
INLINE vint vcast_vi_vm(vmask vm) { return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff)); } +static INLINE vmask vcast_vm_vi(vint vi) FUNC_ATTR { return svreinterpret_s32_s64(svextw_s64_z(ptrue, svreinterpret_s64_s32(vi))); } +static INLINE vint vcast_vi_vm(vmask vm) FUNC_ATTR { return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff)); } -static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return svreinterpret_s32_s64(v); } -static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return svreinterpret_s64_s32(m); } -static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return svreinterpret_s32_u64(v); } -static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return svreinterpret_u64_s32(m); } +static INLINE vmask vreinterpret_vm_vi64(vint64 v) FUNC_ATTR { return svreinterpret_s32_s64(v); } +static INLINE vint64 vreinterpret_vi64_vm(vmask m) FUNC_ATTR { return svreinterpret_s64_s32(m); } +static INLINE vmask vreinterpret_vm_vu64(vuint64 v) FUNC_ATTR { return svreinterpret_s32_u64(v); } +static INLINE vuint64 vreinterpret_vu64_vm(vmask m) FUNC_ATTR { return svreinterpret_u64_s32(m); } diff --git a/src/common/commonfuncs.h b/src/common/commonfuncs.h index 19ec746bef1d277277f2d43724644f0a519cbf2e..f6de2a3cdab044b2a01fa67dc84bdeeac8e2c0a8 100644 --- a/src/common/commonfuncs.h +++ b/src/common/commonfuncs.h @@ -3,21 +3,21 @@ // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) typedef struct { vdouble x, y, z; } vdouble3; -static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) { return v.x; } -static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) { return v.y; } 
-static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) { return v.z; } -static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) { +static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) FUNC_ATTR { return v.x; } +static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) FUNC_ATTR { return v.y; } +static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) FUNC_ATTR { return v.z; } +static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) FUNC_ATTR { vdouble3 v = { x, y, z }; return v; } -static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { v.x = d; return v; } -static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { v.y = d; return v; } -static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { v.z = d; return v; } +static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { v.x = d; return v; } +static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { v.y = d; return v; } +static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) FUNC_ATTR { v.z = d; return v; } // @@ -25,12 +25,12 @@ typedef struct { vdouble2 a, b; } dd2; -static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) { +static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) FUNC_ATTR { dd2 r = { a, b }; return r; } -static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; } -static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; } +static vdouble2 dd2geta_vd2_dd2(dd2 d) FUNC_ATTR { return d.a; } +static vdouble2 dd2getb_vd2_dd2(dd2 d) FUNC_ATTR { return d.b; } // @@ -39,31 +39,31 @@ typedef struct { vdouble3 d3; } tdx; -static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) { return t.e; } -static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) { return t.d3; } -static INLINE CONST VECTOR_CC vdouble 
tdxgetd3x_vd_tdx(tdx t) { return t.d3.x; } -static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) { return t.d3.y; } -static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) { return t.d3.z; } -static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { t.e = e; return t; } -static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { t.d3 = d3; return t; } -static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { t.d3.x = x; return t; } -static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { t.d3.y = y; return t; } -static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { t.d3.z = z; return t; } -static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) { +static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) FUNC_ATTR { return t.e; } +static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) FUNC_ATTR { return t.d3; } +static INLINE CONST VECTOR_CC vdouble tdxgetd3x_vd_tdx(tdx t) FUNC_ATTR { return t.d3.x; } +static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) FUNC_ATTR { return t.d3.y; } +static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) FUNC_ATTR { return t.d3.z; } +static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) FUNC_ATTR { t.e = e; return t; } +static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) FUNC_ATTR { t.d3 = d3; return t; } +static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) FUNC_ATTR { t.d3.x = x; return t; } +static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) FUNC_ATTR { t.d3.y = y; return t; } +static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) FUNC_ATTR { t.d3.z = z; return t; } +static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) FUNC_ATTR { t.d3 = (vdouble3) { x, y, z }; return t; } -static INLINE CONST VECTOR_CC tdx 
tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return (tdx) { e, d3 }; } -static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) { +static INLINE CONST VECTOR_CC tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) FUNC_ATTR { return (tdx) { e, d3 }; } +static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) FUNC_ATTR { return (tdx) { e, (vdouble3) { x, y, z } }; } -static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) { return v.x; } -static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) { return v.y; } -static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return (vquad) { x, y }; } -static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) { v.x = x; return v; } -static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) { v.y = y; return v; } +static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) FUNC_ATTR { return v.x; } +static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) FUNC_ATTR { return v.y; } +static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) FUNC_ATTR { return (vquad) { x, y }; } +static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) FUNC_ATTR { v.x = x; return v; } +static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) FUNC_ATTR { v.y = y; return v; } // @@ -72,9 +72,9 @@ typedef struct { vint i; } di_t; -static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) { return d.d; } -static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) { return d.i; } -static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) { +static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) FUNC_ATTR { return d.d; } +static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) FUNC_ATTR { return d.i; } +static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) FUNC_ATTR { di_t r = { d, i }; return r; } @@ -86,13 +86,13 @@ typedef struct { vint i; } ddi_t; 
-static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; } -static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) { return d.i; } -static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) { +static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) FUNC_ATTR { return d.dd; } +static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) FUNC_ATTR { return d.i; } +static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) FUNC_ATTR { ddi_t r = { v, i }; return r; } -static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) { +static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) FUNC_ATTR { ddi.dd = v; return ddi; } @@ -104,10 +104,10 @@ typedef struct { vint i; } tdi_t; -static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) { return d.td; } -static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) { return d.td.x; } -static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) { return d.i; } -static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) { +static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) FUNC_ATTR { return d.td; } +static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) FUNC_ATTR { return d.td.x; } +static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) FUNC_ATTR { return d.i; } +static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) FUNC_ATTR { tdi_t r = { v, i }; return r; } @@ -118,7 +118,7 @@ static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) { #include #include -static void printvmask(char *mes, vmask g) { +static void printvmask(char *mes, vmask g) FUNC_ATTR { uint64_t u[VECTLENDP]; vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(g)); printf("%s ", mes); @@ -127,7 +127,7 @@ static void printvmask(char *mes, vmask g) { } #if !defined(ENABLE_SVE) -static void printvopmask(char *mes, vopmask g) { +static void printvopmask(char 
*mes, vopmask g) FUNC_ATTR { union { vopmask g; uint8_t u[sizeof(vopmask)]; @@ -137,13 +137,13 @@ static void printvopmask(char *mes, vopmask g) { printf("\n"); } #else -static void printvopmask(char *mes, vopmask g) { +static void printvopmask(char *mes, vopmask g) FUNC_ATTR { vmask m = vand_vm_vo64_vm(g, vcast_vm_i64(-1)); printvmask(mes, m); } #endif -static void printvdouble(char *mes, vdouble vd) { +static void printvdouble(char *mes, vdouble vd) FUNC_ATTR { double u[VECTLENDP]; vstoreu_v_p_vd((double *)u, vd); printf("%s ", mes); @@ -151,7 +151,7 @@ static void printvdouble(char *mes, vdouble vd) { printf("\n"); } -static void printvint(char *mes, vint vi) { +static void printvint(char *mes, vint vi) FUNC_ATTR { uint32_t u[VECTLENDP]; vstoreu_v_p_vi((int32_t *)u, vi); printf("%s ", mes); @@ -159,7 +159,7 @@ static void printvint(char *mes, vint vi) { printf("\n"); } -static void printvint64(char *mes, vint64 vi) { +static void printvint64(char *mes, vint64 vi) FUNC_ATTR { uint64_t u[VECTLENDP*2]; vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vreinterpret_vm_vi64(vi))); printf("%s ", mes); @@ -167,7 +167,7 @@ static void printvint64(char *mes, vint64 vi) { printf("\n"); } -static void printvquad(char *mes, vquad g) { +static void printvquad(char *mes, vquad g) FUNC_ATTR { uint64_t u[VECTLENDP*2]; vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vqgetx_vm_vq(g))); vstoreu_v_p_vd((double *)&u[VECTLENDP], vreinterpret_vd_vm(vqgety_vm_vq(g))); @@ -181,57 +181,57 @@ static void printvquad(char *mes, vquad g) { // vdouble functions -static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) { +static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) FUNC_ATTR { return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))); } -static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) { +static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) FUNC_ATTR { return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x)); } 
-static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) { +static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) FUNC_ATTR { return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(INT64_C(0x7ff0000000000000))), vcast_vm_i64(INT64_C(0x7ff0000000000000))); } -static INLINE CONST vmask vsignbit_vm_vd(vdouble d) { +static INLINE CONST vmask vsignbit_vm_vd(vdouble d) FUNC_ATTR { return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))); } -static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) { +static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) FUNC_ATTR { return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))); } -static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) { +static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) FUNC_ATTR { return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_u64((~UINT64_C(0)) << n))); } -static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) { // returns nextafter(x, 0) +static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) FUNC_ATTR { // returns nextafter(x, 0) vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(-1))); return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t); } #if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) -static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { +static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y))); } #endif -static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) FUNC_ATTR { return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d); } #if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || 
defined(ENABLE_RVVM2NOFMA)) -static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y))); } -static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) FUNC_ATTR { return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)), vand_vm_vm_vm (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y)))); } #endif -static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) { +static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vtruncate_vd_vd(x); #else @@ -241,21 +241,21 @@ static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) { #endif } -static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) { +static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) FUNC_ATTR { vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))))); fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr))); fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr); return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x)); } -static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) { +static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) FUNC_ATTR { vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))))); fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr))); fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, 
vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0))); return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x)); } -static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) FUNC_ATTR { vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5)); vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))))); fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr))); @@ -265,7 +265,7 @@ static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) { return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d)); } -static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vrint_vd_vd(d); #else @@ -275,11 +275,11 @@ static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) { #endif } -static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) { +static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) FUNC_ATTR { return veq_vo_vd_vd(vrint2_vd_vd(d), d); } -static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) { +static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) FUNC_ATTR { vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5)); return vneq_vo_vd_vd(vrint2_vd_vd(x), x); } @@ -287,7 +287,7 @@ static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) { // ilogb #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) -static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) { +static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); d = vsel_vd_vo_vd_vd(o, 
vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); vint q = vcastu_vi_vm(vreinterpret_vm_vd(d)); @@ -297,7 +297,7 @@ static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) { return q; } -static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) { +static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) FUNC_ATTR { vint q = vcastu_vi_vm(vreinterpret_vm_vd(d)); q = vsrl_vi_vi_i(q, 20); q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff)); @@ -306,7 +306,7 @@ static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) { } #endif -static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) { +static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) FUNC_ATTR { vmask m = vreinterpret_vm_vd(d); m = vsrl64_vm_vm_i(m, 20 + 32); m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff)); @@ -314,7 +314,7 @@ static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) { return m; } -static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) { +static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) FUNC_ATTR { vmask m = vreinterpret_vm_vd(d); m = vsrl64_vm_vm_i(m, 20 + 32); m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff)); @@ -323,18 +323,18 @@ static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) { // ldexp -static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) { +static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) FUNC_ATTR { q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q); vmask r = vcastu_vm_vi(vsll_vi_vi_i(q, 20)); return vreinterpret_vd_vm(r); } -static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) { +static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) FUNC_ATTR { q = vadd64_vm_vm_vm(vcast_vm_i64(0x3ff), q); return vreinterpret_vd_vm(vsll64_vm_vm_i(q, 52)); } -static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) { +static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) FUNC_ATTR { vint m = vsra_vi_vi_i(q, 31); m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7); q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2)); @@ -346,15 +346,15 @@ static INLINE 
CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) { return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); } -static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) { +static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) FUNC_ATTR { return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1)))); } -static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) { +static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) FUNC_ATTR { return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vcastu_vm_vi(vsll_vi_vi_i(q, 20)))); } -static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) { +static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) FUNC_ATTR { vmask m = vsrl64_vm_vm_i(e, 2); e = vsub64_vm_vm_vm(vsub64_vm_vm_vm(vsub64_vm_vm_vm(e, m), m), m); d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m)); @@ -364,32 +364,32 @@ static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) { return d; } -static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) { +static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) FUNC_ATTR { return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vm(vsrl64_vm_vm_i(e, 1))), vpow2i_vd_vm(vsub64_vm_vm_vm(e, vsrl64_vm_vm_i(e, 1)))); } -static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) { +static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) FUNC_ATTR { return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vsll64_vm_vm_i(q, 52))); } // vmask functions -static INLINE CONST vdouble vcast_vd_vm(vmask m) { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only -static INLINE CONST vmask vtruncate_vm_vd(vdouble d) { return vcast_vm_vi(vtruncate_vi_vd(d)); } +static INLINE CONST vdouble vcast_vd_vm(vmask m) FUNC_ATTR { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only +static INLINE CONST 
vmask vtruncate_vm_vd(vdouble d) FUNC_ATTR { return vcast_vm_vi(vtruncate_vi_vd(d)); } -static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) { return vgt64_vo_vm_vm(y, x); } +static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) FUNC_ATTR { return vgt64_vo_vm_vm(y, x); } -static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) { +static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) FUNC_ATTR { return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i64(0), vcast_vm_i64(0))); } -static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) { // unsigned compare +static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) FUNC_ATTR { // unsigned compare x = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), x); y = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), y); return vgt64_vo_vm_vm(x, y); } -static INLINE CONST vmask vilogbk_vm_vd(vdouble d) { +static INLINE CONST vmask vilogbk_vm_vd(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(4.9090934652977266E-91)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); vmask q = vreinterpret_vm_vd(d); @@ -401,18 +401,18 @@ static INLINE CONST vmask vilogbk_vm_vd(vdouble d) { // vquad functions -static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) { +static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) FUNC_ATTR { return vqsetxy_vq_vm_vm(vsel_vm_vo64_vm_vm(o, vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vsel_vm_vo64_vm_vm(o, vqgety_vm_vq(x), vqgety_vm_vq(y))); } -static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) { +static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) FUNC_ATTR { vquad r = vqsetxy_vq_vm_vm(vadd64_vm_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vadd64_vm_vm_vm(vqgety_vm_vq(x), vqgety_vm_vq(y))); r = vqsety_vq_vq_vm(r, vadd64_vm_vm_vm(vqgety_vm_vq(r), vand_vm_vo64_vm(vugt64_vo_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(r)), vcast_vm_i64(1)))); return r; } -static INLINE CONST vquad 
imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; } +static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) FUNC_ATTR { vquad r = vqsetxy_vq_vm_vm(x, y); return r; } // imm must be smaller than 64 #define srl128_vq_vq_i(m, imm) \ @@ -420,7 +420,7 @@ static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_v // This function is equivalent to : // di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) }; -static INLINE CONST di_t rempisub(vdouble x) { +static INLINE CONST di_t rempisub(vdouble x) FUNC_ATTR { #ifdef FULL_FP_ROUNDING vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4))); vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4)))); diff --git a/src/common/dd.h b/src/common/dd.h index 3f3ed4882091e7c3f816c9bd589f8f0410a60852..bb945d04aa1146bf4dc8c17dc96a68625f67c19a 100644 --- a/src/common/dd.h +++ b/src/common/dd.h @@ -3,7 +3,7 @@ // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) #if !defined(SLEEF_ENABLE_CUDA) typedef struct { vdouble x, y; @@ -12,11 +12,11 @@ typedef struct { typedef double2 vdouble2; #endif -static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; } -static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; } -static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; } -static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; } -static INLINE CONST VECTOR_CC vdouble2 
vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; } +static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) FUNC_ATTR { return v.x; } +static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) FUNC_ATTR { return v.y; } +static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { vdouble2 v; v.x = x; v.y = y; return v; } +static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) FUNC_ATTR { v.x = d; return v; } +static INLINE CONST VECTOR_CC vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) FUNC_ATTR { v.y = d; return v; } #endif #if !defined(SLEEF_ENABLE_CUDA) @@ -25,159 +25,159 @@ typedef struct { } double2; #endif -static INLINE CONST VECTOR_CC double2 dd(double h, double l) { +static INLINE CONST VECTOR_CC double2 dd(double h, double l) FUNC_ATTR { double2 ret = { h, l }; return ret; } -static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) FUNC_ATTR { return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000))); } -static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) { +static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) FUNC_ATTR { return vd2setxy_vd2_vd_vd(h, l); } -static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) { +static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) FUNC_ATTR { return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l)); } -static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) { +static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) FUNC_ATTR { return vd2setxy_vd2_vd_vd(vcast_vd_d(dd.x), vcast_vd_d(dd.y)); } -static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) FUNC_ATTR { return 
vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)), vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))); } -static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) { +static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) FUNC_ATTR { return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0), vsel_vd_vo_d_d(o, y1, y0)); } -static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) { +static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) FUNC_ATTR { return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2); } -static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { +static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) FUNC_ATTR { return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3); } -static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { +static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) FUNC_ATTR { return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4); } -static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { +static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) FUNC_ATTR { return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5); } -static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) { +static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) FUNC_ATTR { return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6); } -static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, 
vdouble v1, vdouble v2) { +static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) FUNC_ATTR { return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2); } -static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { +static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) FUNC_ATTR { return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3); } -static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { +static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) FUNC_ATTR { return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4); } -static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { +static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) FUNC_ATTR { return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5); } // -static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) FUNC_ATTR { return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x))); } -static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) FUNC_ATTR { return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)), vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)), vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)), vreinterpret_vm_vd(vcast_vd_d(-0.0)))))); } -static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) { +static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t)); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t))); } -static 
INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) { +static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) FUNC_ATTR { return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s)); } -static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); } +static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) FUNC_ATTR { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); } -static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(x, y); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y)); } -static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(x, y); vdouble v = vsub_vd_vd_vd(s, x); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y); return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x))); } -static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) FUNC_ATTR { vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) 
FUNC_ATTR { vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y); vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x)); vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y)); return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y)); vdouble v = vsub_vd_vd_vd(s, x); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { // |x| >= |y| vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)); return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))); } -static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)); vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x)); vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)); return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)))); } -static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble 
x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { // |x| >= |y| vdouble s = vsub_vd_vd_vd(x, y); return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y)); } -static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { // |x| >= |y| vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)); @@ -188,7 +188,7 @@ static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) } #ifdef ENABLE_FMA_DP -static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) FUNC_ATTR { vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d)); vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t); vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s); @@ -196,45 +196,45 @@ static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u))); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { vdouble s = vmul_vd_vd_vd(x, y); return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s)); } -static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) FUNC_ATTR { vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s))); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { vdouble s = 
vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)); return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s)))); } -static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)))); } -static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) FUNC_ATTR { return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)))); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) FUNC_ATTR { vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y); return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s))); } -static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) FUNC_ATTR { vdouble s = vrec_vd_vd(d); return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1)))); } -static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) FUNC_ATTR { vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d)); return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1))))); } #else // #ifdef ENABLE_FMA_DP -static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { +static 
INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) FUNC_ATTR { vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d)); vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh); vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); @@ -248,7 +248,7 @@ static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u)); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) FUNC_ATTR { vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh); vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh); @@ -256,7 +256,7 @@ static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl))); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) FUNC_ATTR { vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh); vdouble yh = vupper_vd_vd(y ), yl = vsub_vd_vd_vd(y , yh); @@ -264,7 +264,7 @@ static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y))); } -static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh); vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = 
vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh); @@ -272,27 +272,27 @@ static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y)))); } -static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) { +static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) FUNC_ATTR { vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh); vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh); return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh)); } -static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) FUNC_ATTR { vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh); vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x))))); } -static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) FUNC_ATTR { vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh); return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh)); } -static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) { +static INLINE CONST 
VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) FUNC_ATTR { vdouble t = vrec_vd_vd(d); vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh); vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th); @@ -300,7 +300,7 @@ static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) { return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl)))); } -static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) FUNC_ATTR { vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d)); vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh); vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); @@ -309,16 +309,16 @@ static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) { } #endif // #ifdef ENABLE_FMA_DP -static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) FUNC_ATTR { vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d))); return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5)); } -static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) { +static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) FUNC_ATTR { vdouble t = vsqrt_vd_vd(d); return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5)); } -static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) { +static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) FUNC_ATTR { return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd2(x, y)); } diff --git a/src/common/df.h b/src/common/df.h index d1ab7a77d0dbc2af26c6923228a1f84b9a6b9144..ef96c5a09f137a19dca2bb9be7bddabd4d458be8 100644 --- a/src/common/df.h +++ 
b/src/common/df.h @@ -3,7 +3,7 @@ // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) #if !defined(SLEEF_ENABLE_CUDA) typedef struct { vfloat x, y; @@ -12,164 +12,164 @@ typedef struct { typedef float2 vfloat2; #endif -static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; } -static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } -static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; } -static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; } -static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; } +static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) FUNC_ATTR { return v.x; } +static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) FUNC_ATTR { return v.y; } +static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { vfloat2 v; v.x = x; v.y = y; return v; } +static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) FUNC_ATTR { v.x = d; return v; } +static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) FUNC_ATTR { v.y = d; return v; } #endif -static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) FUNC_ATTR { return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000))); } -static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) { +static INLINE CONST VECTOR_CC vfloat2 
vcast_vf2_vf_vf(vfloat h, vfloat l) FUNC_ATTR { return vf2setxy_vf2_vf_vf(h, l); } -static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) { +static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) FUNC_ATTR { return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l)); } -static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) { +static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) FUNC_ATTR { return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d)); } -static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) FUNC_ATTR { return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))); } -static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) { +static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) FUNC_ATTR { return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0)); } -static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { +static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) FUNC_ATTR { return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2))); } -static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { +static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) FUNC_ATTR { return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3)))); } -static INLINE 
CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) FUNC_ATTR { return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))), vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x))))); } -static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { +static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) FUNC_ATTR { return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2); } -static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { +static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) FUNC_ATTR { return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3); } -static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { +static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) FUNC_ATTR { return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4); } -static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) { +static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) FUNC_ATTR { return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5); } -static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) { +static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) FUNC_ATTR { return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6); } -static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { +static INLINE CONST 
VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) FUNC_ATTR { return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2); } -static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { +static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) FUNC_ATTR { return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3); } -static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { +static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) FUNC_ATTR { return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4); } // -static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) FUNC_ATTR { return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x))); } -static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) FUNC_ATTR { return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)), vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f)))))); } -static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) { +static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t)); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t))); } -static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) { +static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) FUNC_ATTR { return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s)); } -static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, 
vfloat y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(x, y); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y)); } -static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(x, y); vfloat v = vsub_vf_vf_vf(s, x); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v))); } -static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y)); vfloat v = vsub_vf_vf_vf(s, x); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y))); } -static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y); return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x))); } -static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) FUNC_ATTR { vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x))); } -static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y); vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x)); vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x))); } -static INLINE CONST VECTOR_CC 
vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y)); return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y))); } -static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { // |x| >= |y| vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)); return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))); } -static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)); vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x)); vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)); return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)))); } -static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { // |x| >= |y| vfloat s = vsub_vf_vf_vf(x, y); return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y)); } -static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { // |x| >= |y| vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)); @@ -180,7 +180,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { } #ifdef ENABLE_FMA_SP -static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) 
FUNC_ATTR { vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d)); vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t); vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s); @@ -188,45 +188,45 @@ static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u))); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { vfloat s = vmul_vf_vf_vf(x, y); return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s)); } -static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) FUNC_ATTR { vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)); return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s))); } -static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) FUNC_ATTR { return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)))); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)); return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s)))); } -static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), 
vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y)))); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) FUNC_ATTR { vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y); return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s))); } -static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) FUNC_ATTR { vfloat s = vrec_vf_vf(d); return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1)))); } -static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) FUNC_ATTR { vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d)); return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1))))); } #else -static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) FUNC_ATTR { vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d)); vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh); vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); @@ -251,7 +251,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u)); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) FUNC_ATTR { vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh); vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh); @@ -265,7 +265,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { 
return vf2setxy_vf2_vf_vf(s, t); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) FUNC_ATTR { vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh); vfloat yh = vupper_vf_vf(y ), yl = vsub_vf_vf_vf(y , yh); @@ -280,7 +280,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { return vf2setxy_vf2_vf_vf(s, t); } -static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh); vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh); @@ -296,14 +296,14 @@ static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { return vf2setxy_vf2_vf_vf(s, t); } -static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) { +static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) FUNC_ATTR { vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh); vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh); return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh)); } -static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) FUNC_ATTR { vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh); vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t; @@ -316,13 +316,13 @@ static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) { return vf2setxy_vf2_vf_vf(s, t); } -static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) { +static INLINE 
CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) FUNC_ATTR { vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh); return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh)); } -static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) FUNC_ATTR { vfloat t = vrec_vf_vf(d); vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh); vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th); @@ -336,7 +336,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) { return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u)); } -static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) FUNC_ATTR { vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d)); vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh); vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); @@ -352,7 +352,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) { } #endif -static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) FUNC_ATTR { #ifdef ENABLE_RECSQRT_SP vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d))); vfloat2 r = dfmul_vf2_vf2_vf(d, x); @@ -363,7 +363,7 @@ static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) { #endif } -static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) FUNC_ATTR { vfloat t = vsqrt_vf_vf(d); return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f)); } diff --git a/src/common/keywords.txt b/src/common/keywords.txt index 
659fa5e1034862cc0ab50f5829d2fbf84ceab6d0..816919c6fc3dae10e7c5c1e6621a22baea2153b0 100644 --- a/src/common/keywords.txt +++ b/src/common/keywords.txt @@ -116,7 +116,9 @@ poly4dd poly4df pragma rempi +rempi_core rempif +rempif_core rempisub rempisubf sinpifk diff --git a/src/common/main_checkfeature.c b/src/common/main_checkfeature.c index b5d7b9a07f3a16c9f98701ac6d647fea760607e4..3ac7145490c7636c8fffbb8199a69e0094dea965 100644 --- a/src/common/main_checkfeature.c +++ b/src/common/main_checkfeature.c @@ -19,8 +19,14 @@ static sigjmp_buf sigjmp; #define LONGJMP siglongjmp #endif +#if defined(__ARM_FEATURE_SME) +#include <arm_sme.h> +int main2(int argc, char **argv) __arm_streaming; +int check_feature(double, float) __arm_streaming; +#else int main2(int argc, char **argv); int check_feature(double, float); +#endif static void sighandler(int signum) { LONGJMP(sigjmp, 1); diff --git a/src/common/misc.h b/src/common/misc.h index 6b571cb7feb274201ba7a6d5451b84a2225459ec..2f21411d5df49f8c9ae0240b709f708a0fd98271 100644 --- a/src/common/misc.h +++ b/src/common/misc.h @@ -211,6 +211,7 @@ typedef struct { #if defined(SLEEF_GENHEADER) #define INLINE SLEEF_ALWAYS_INLINE +#define NOINLINE SLEEF_NOINLINE #define EXPORT SLEEF_INLINE #define CONST SLEEF_CONST #define NOEXPORT @@ -219,6 +220,7 @@ typedef struct { #define CONST __attribute__((const)) #define INLINE __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #ifndef SLEEF_STATIC_LIBS @@ -255,6 +257,7 @@ typedef struct { #if defined(SLEEF_GENHEADER) #define INLINE SLEEF_ALWAYS_INLINE +#define NOINLINE SLEEF_NOINLINE #define CONST SLEEF_CONST #define EXPORT SLEEF_INLINE #define NOEXPORT @@ -262,6 +265,7 @@ typedef struct { #else // #if defined(SLEEF_GENHEADER) #define INLINE __forceinline +#define NOINLINE __declspec(noinline) #define CONST #ifndef SLEEF_STATIC_LIBS #define EXPORT __declspec(dllexport) @@ -318,12 +322,20 @@ typedef struct {
#endif // #ifndef __MISC_H__ +// Set AAVPCS only if explicitly enabled #ifdef ENABLE_AAVPCS #define VECTOR_CC __attribute__((aarch64_vector_pcs)) #else #define VECTOR_CC #endif +// Streaming attribute is passed to function attributes only through CONFIG +#define STREAM_ATTR __arm_streaming_compatible + +// Set default function attribute to empty. +// Vector extensions that need special attribute should redefine it. +#define FUNC_ATTR + // #if defined (__GNUC__) && !defined(__INTEL_COMPILER) diff --git a/src/libm-tester/CMakeLists.txt b/src/libm-tester/CMakeLists.txt index 320a2a9f5605106bb21b339d7891f2e9a291b54c..7f539f45f245b545540725d23717a90e4c00b3c8 100644 --- a/src/libm-tester/CMakeLists.txt +++ b/src/libm-tester/CMakeLists.txt @@ -16,6 +16,7 @@ set(TESTER3_DEFINITIONS_ADVSIMD ATR=finz_ DPTYPE=float64x2_t SPTYPE=float3 set(TESTER3_DEFINITIONS_ADVSIMDNOFMA ATR=cinz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimdnofma) set(TESTER3_DEFINITIONS_SVE ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=sve) set(TESTER3_DEFINITIONS_SVENOFMA ATR=cinz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svenofma) +set(TESTER3_DEFINITIONS_SVESTREAM ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svestream) set(TESTER3_DEFINITIONS_VSX ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx) set(TESTER3_DEFINITIONS_VSXNOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsxnofma) @@ -42,7 +43,7 @@ if (SLEEF_ARCH_X86) set(TEST3_FINZ purecfma_scalar avx2128 avx2 avx512f) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") set(TEST3_CINZ purec_scalar advsimdnofma svenofma) - set(TEST3_FINZ purecfma_scalar advsimd sve) + set(TEST3_FINZ purecfma_scalar advsimd sve svestream) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") set(TEST3_CINZ purec_scalar)
set(TEST3_FINZ purecfma_scalar) diff --git a/src/libm-tester/gnuabi_compatibility.c b/src/libm-tester/gnuabi_compatibility.c index 3e168c5e571231b68196b5a6ebbdb5307c80f243..9ab522105eba9474125758d1f7f6eded7e150d8a 100644 --- a/src/libm-tester/gnuabi_compatibility.c +++ b/src/libm-tester/gnuabi_compatibility.c @@ -26,6 +26,7 @@ #define VLEN_SP 4 #define VLEN_DP 2 #define VECTOR_CC +#define FUNC_ATTR typedef __m128i vopmask; typedef __m128d vdouble; @@ -41,6 +42,7 @@ typedef __m128i vint2; #define VLEN_SP 8 #define VLEN_DP 4 #define VECTOR_CC +#define FUNC_ATTR typedef __m256i vopmask; typedef __m256d vdouble; @@ -56,6 +58,7 @@ typedef struct { __m128i x, y; } vint2; #define VLEN_SP 8 #define VLEN_DP 4 #define VECTOR_CC +#define FUNC_ATTR typedef __m256i vopmask; typedef __m256d vdouble; @@ -71,6 +74,7 @@ typedef __m256i vint2; #define VLEN_SP 16 #define VLEN_DP 8 #define VECTOR_CC +#define FUNC_ATTR typedef __mmask16 vopmask; typedef __m512d vdouble; @@ -90,6 +94,7 @@ typedef __m512i vint2; #else #define VECTOR_CC #endif +#define FUNC_ATTR typedef uint32x4_t vopmask; typedef float64x2_t vdouble; @@ -105,6 +110,7 @@ typedef int32x4_t vint2; #define VLEN_DP (svcntd()) #define VLA_TOKEN x #define VECTOR_CC +#define FUNC_ATTR typedef svbool_t vopmask; typedef svfloat64_t vdouble; @@ -113,78 +119,101 @@ typedef svint32_t vint; typedef svint32_t vint2; #endif /* ENABLE_SVE */ +#ifdef ENABLE_SVESTREAM +#include <arm_sme.h> +#define ISA_TOKEN c +#define VLEN_SP (svcntw()) +#define VLEN_DP (svcntd()) +#define VLA_TOKEN x +#define VECTOR_CC +#define FUNC_ATTR __arm_streaming_compatible + +typedef svbool_t vopmask; +typedef svfloat64_t vdouble; +typedef svfloat32_t vfloat; +typedef svint32_t vint; +typedef svint32_t vint2; +#endif /* ENABLE_SVESTREAM */ + +// Function attribute for test helper functions +#ifdef ENABLE_SVESTREAM +#define TEST_FUNC_ATTR __arm_streaming +#else +#define TEST_FUNC_ATTR +#endif + // GNUABI name mangling macro.
#ifndef MASKED_GNUABI #define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name #define __DECLARE_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) FUNC_ATTR #define __CALL_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0) #define __DECLARE_vi_vd(name, t, vl, p) \ - extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) + extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble) FUNC_ATTR #define __CALL_vi_vd(name, t, vl, p) \ do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0) #define __DECLARE_vd_vd_vi(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint) FUNC_ATTR #define __CALL_vd_vd_vi(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0) #define __DECLARE_vd_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble) FUNC_ATTR #define __CALL_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0) #define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble) FUNC_ATTR #define __CALL_vd_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0) #define __DECLARE_vd_vd_pvd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *) FUNC_ATTR #define __CALL_vd_vd_pvd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0) #define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ - extern void 
VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *) + extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *) FUNC_ATTR #define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0) #define __DECLARE_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) FUNC_ATTR #define __CALL_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0) #define __DECLARE_vf_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat) FUNC_ATTR #define __CALL_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0) #define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat) FUNC_ATTR #define __CALL_vf_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0) #define __DECLARE_vf_vf_pvf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *) FUNC_ATTR #define __CALL_vf_vf_pvf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0) #define __DECLARE_vi_vf(name, t, vl, p) \ - extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) + extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat) FUNC_ATTR #define __CALL_vi_vf(name, t, vl, p) \ do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0) #define __DECLARE_vf_vf_vi(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2) FUNC_ATTR #define 
__CALL_vf_vf_vi(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0) #define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*) FUNC_ATTR #define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0) @@ -193,72 +222,72 @@ typedef svint32_t vint2; #define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name #define __DECLARE_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) FUNC_ATTR #define __CALL_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0) #define __DECLARE_vi_vd(name, t, vl, p) \ - extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) + extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask) FUNC_ATTR #define __CALL_vi_vd(name, t, vl, p) \ do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0) #define __DECLARE_vd_vd_vi(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask) FUNC_ATTR #define __CALL_vd_vd_vi(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0) #define __DECLARE_vd_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask) FUNC_ATTR #define __CALL_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0) #define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask) + extern vdouble VECTOR_CC 
__MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask) FUNC_ATTR #define __CALL_vd_vd_vd_vd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0) #define __DECLARE_vd_vd_pvd(name, t, vl, p) \ - extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask) + extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask) FUNC_ATTR #define __CALL_vd_vd_pvd(name, t, vl, p) \ do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0) #define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \ - extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask) + extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask) FUNC_ATTR #define __CALL_v_vd_pvd_pvd(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0) #define __DECLARE_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) FUNC_ATTR #define __CALL_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0) #define __DECLARE_vf_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask) FUNC_ATTR #define __CALL_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0) #define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask) FUNC_ATTR #define __CALL_vf_vf_vf_vf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0) #define __DECLARE_vf_vf_pvf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, 
p)(vfloat, vfloat *, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask) FUNC_ATTR #define __CALL_vf_vf_pvf(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0) #define __DECLARE_vi_vf(name, t, vl, p) \ - extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) + extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask) FUNC_ATTR #define __CALL_vi_vf(name, t, vl, p) \ do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0) #define __DECLARE_vf_vf_vi(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask) FUNC_ATTR #define __CALL_vf_vf_vi(name, t, vl, p) \ do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0) #define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \ - extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask) + extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask) FUNC_ATTR #define __CALL_v_vf_pvf_pvf(name, t, vl, p) \ do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0) @@ -283,13 +312,13 @@ typedef svint32_t vint2; #error "Missing VLEN_SP" #endif -#if defined(ENABLE_SVE) && !defined(VLA_TOKEN) +#if (defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) && !defined(VLA_TOKEN) #error "Missing VLA_TOKEN" -#endif /* defined(ENABLE_SVE) && !defined(VLA_TOKEN) */ +#endif /* (defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) && !defined(VLA_TOKEN) */ // Declaration and call, first level expantion to pick up the // ISA_TOKEN and VLEN_* architectural macros. 
-#ifndef ENABLE_SVE +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) #define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLEN_DP, p) #define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLEN_DP, p) @@ -333,7 +362,7 @@ typedef svint32_t vint2; #define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p) #define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p) -#else /* ENABLE_SVE */ +#else /* !(defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) */ #define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p) #define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0) @@ -377,7 +406,7 @@ typedef svint32_t vint2; #define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p) #define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2) -#endif /* ENABLE_SVE */ +#endif /* !(defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) */ // @@ -528,7 +557,7 @@ DECLARE_SP_vf_vf(tanhf, v); DECLARE_SP_vf_vf(tgammaf, v); DECLARE_SP_vf_vf(truncf, v); -#ifndef ENABLE_SVE +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM)) vdouble vd0, vd1, vd2, vd3; vfloat vf0, vf1, vf2, vf3; vint vi0, vi1, vi2, vi3; @@ -538,8 +567,8 @@ vopmask mask; volatile char outbuf[1024]; #endif -int check_feature(double d, float f) { -#ifdef ENABLE_SVE +int check_feature(double d, float f) TEST_FUNC_ATTR { +#if defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM) vdouble vd0 = svdup_n_f64(d), vd1 = svdup_n_f64(d); #ifdef MASKED_GNUABI vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(f), svdup_n_s32(0)); @@ -547,14 +576,14 @@ int check_feature(double d, float f) { #endif CALL_DP_vd_vd(__acos_finite, v); -#ifdef ENABLE_SVE +#if defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM) svst1_f64(svptrue_b8(), (double *)outbuf, vd0); #endif 
return 1; } -int main2(int argc, char **argv) { -#ifdef ENABLE_SVE +int main2(int argc, char **argv) TEST_FUNC_ATTR { +#if defined(ENABLE_SVE) || defined(ENABLE_SVESTREAM) vdouble vd0 = svdup_n_f64(argc), vd1 = svdup_n_f64(argc), vd2 = svdup_n_f64(argc), vd3 = svdup_n_f64(argc); vfloat vf0 = svdup_n_f32(argc), vf1 = svdup_n_f32(argc), vf2 = svdup_n_f32(argc), vf3 = svdup_n_f32(argc); vint vi0 = svdup_n_s32(argc), vi2 = svdup_n_s32(argc); diff --git a/src/libm-tester/iutsimd.c b/src/libm-tester/iutsimd.c index a5cd6258437349ea8d09dd02998c4e11dfd7a249..08189db30e90ea2df1bc550827df29c8496a4320 100644 --- a/src/libm-tester/iutsimd.c +++ b/src/libm-tester/iutsimd.c @@ -57,6 +57,10 @@ #include #endif +#if defined(__ARM_FEATURE_SME) +#include +#endif + #if defined(__riscv) && defined(__riscv_v) #include #endif @@ -69,8 +73,8 @@ #include #endif -#define SLEEF_ALWAYS_INLINE inline -#define SLEEF_INLINE +#define SLEEF_ALWAYS_INLINE INLINE +#define SLEEF_INLINE INLINE #define SLEEF_CONST #include USE_INLINE_HEADER #include MACRO_ONLY_HEADER @@ -239,6 +243,14 @@ typedef Sleef___m128_2 vfloat2; #endif #endif +#ifdef ENABLE_SVESTREAM +#include "renamesvestream.h" +#if !defined(USE_INLINE_HEADER) +#define CONFIG 3 +#include "helpersve.h" +#endif +#endif + #ifdef ENABLE_DSP256 #define CONFIG 1 #include "helperavx.h" @@ -432,7 +444,13 @@ typedef Sleef_float_2 vfloat2; // -int check_feature(double d, float f) { +#ifdef __ARM_FEATURE_SME +#define TEST_FUNC_ATTR __arm_streaming +#else +#define TEST_FUNC_ATTR +#endif + +int check_feature(double d, float f) TEST_FUNC_ATTR { #ifdef ENABLE_DP { double s[VECTLENDP]; @@ -462,12 +480,12 @@ int check_feature(double d, float f) { return 0; } -#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) +#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || 
defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; } static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; } #endif -#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) +#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER)) static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; } static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } #endif @@ -646,7 +664,7 @@ static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } #define BUFSIZE 1024 -int main2(int argc, char **argv) { +int main2(int argc, char **argv) TEST_FUNC_ATTR { xsrand(time(NULL)); { diff --git a/src/libm-tester/tester2simddp.c b/src/libm-tester/tester2simddp.c index f236c3b9c602234201ae5ed10b177cad004a3f6b..e8391fc3fd038ce9749f09a3d6793fe8e772d170 100644 --- a/src/libm-tester/tester2simddp.c +++ b/src/libm-tester/tester2simddp.c @@ -127,6 +127,12 @@ typedef Sleef_float32x4_t_2 vfloat2; #include "renamesvenofma.h" #endif +#ifdef ENABLE_SVESTREAM +#define CONFIG 3 +#include "helpersve.h" +#include "renamesvestream.h" +#endif + #ifdef ENABLE_VSX #define CONFIG 1 #include "helperpower_128.h" @@ -241,7 +247,7 @@ typedef Sleef_float_2 vfloat2; // -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || 
defined(ENABLE_RVVM2NOFMA)) static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; } static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; } #endif diff --git a/src/libm-tester/tester2simdsp.c b/src/libm-tester/tester2simdsp.c index b6574337ddbcdc6694110a71a057ac51258aa762..7eaab51b04110c9e92642822c152e526013f46da 100644 --- a/src/libm-tester/tester2simdsp.c +++ b/src/libm-tester/tester2simdsp.c @@ -127,6 +127,12 @@ typedef Sleef_float32x4_t_2 vfloat2; #include "renamesvenofma.h" #endif +#ifdef ENABLE_SVESTREAM +#define CONFIG 3 +#include "helpersve.h" +#include "renamesvestream.h" +#endif + #ifdef ENABLE_VSX #define CONFIG 1 #include "helperpower_128.h" @@ -241,7 +247,7 @@ typedef Sleef_float_2 vfloat2; // -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; } static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; } #endif diff --git a/src/libm-tester/tester3.c b/src/libm-tester/tester3.c index 865dd22861fb45d6c83a29b3d3831525567efb36..eed0acee78bc56a4cfe18a0188870e0c51e054e3 100644 --- a/src/libm-tester/tester3.c +++ b/src/libm-tester/tester3.c @@ -34,71 +34,79 @@ typedef __attribute__((vector_size(16))) double vector_double; typedef __attribute__((vector_size(16))) float vector_float; #endif +#ifdef __ARM_FEATURE_SME +#undef FUNC_ATTR +#define FUNC_ATTR __arm_streaming_compatible +#define TEST_FUNC_ATTR __arm_streaming +#else +#define TEST_FUNC_ATTR +#endif + // #define XNAN (((union { int64_t u; double d; }) { .u = INT64_C(0xffffffffffffffff) }).d) #define XNANf (((union { int32_t u; float d; }) { .u = 0xffffffff }).d) -static INLINE double unifyValue(double x) { x = !(x == x) ? 
XNAN : x; return x; } -static INLINE float unifyValuef(float x) { x = !(x == x) ? XNANf : x; return x; } +static INLINE double unifyValue(double x) FUNC_ATTR { x = !(x == x) ? XNAN : x; return x; } +static INLINE float unifyValuef(float x) FUNC_ATTR { x = !(x == x) ? XNANf : x; return x; } -static INLINE double setdouble(double d, int r) { return d; } -static INLINE double getdouble(double v, int r) { return unifyValue(v); } -static INLINE float setfloat(float d, int r) { return d; } -static INLINE float getfloat(float v, int r) { return unifyValuef(v); } +static INLINE double setdouble(double d, int r) FUNC_ATTR { return d; } +static INLINE double getdouble(double v, int r) FUNC_ATTR { return unifyValue(v); } +static INLINE float setfloat(float d, int r) FUNC_ATTR { return d; } +static INLINE float getfloat(float v, int r) FUNC_ATTR { return unifyValuef(v); } #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) -static INLINE __m128d set__m128d(double d, int r) { static double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return _mm_loadu_pd(a); } -static INLINE double get__m128d(__m128d v, int r) { static double a[2]; _mm_storeu_pd(a, v); return unifyValue(a[r & 1]); } -static INLINE __m128 set__m128(float d, int r) { static float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return _mm_loadu_ps(a); } -static INLINE float get__m128(__m128 v, int r) { static float a[4]; _mm_storeu_ps(a, v); return unifyValuef(a[r & 3]); } +static INLINE __m128d set__m128d(double d, int r) FUNC_ATTR { static double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return _mm_loadu_pd(a); } +static INLINE double get__m128d(__m128d v, int r) FUNC_ATTR { static double a[2]; _mm_storeu_pd(a, v); return unifyValue(a[r & 1]); } +static INLINE __m128 set__m128(float d, int r) FUNC_ATTR { static float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return _mm_loadu_ps(a); } +static INLINE float get__m128(__m128 v, int r) FUNC_ATTR { static float a[4]; _mm_storeu_ps(a, v); return unifyValuef(a[r 
& 3]); } #if defined(__AVX__) -static INLINE __m256d set__m256d(double d, int r) { static double a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return _mm256_loadu_pd(a); } -static INLINE double get__m256d(__m256d v, int r) { static double a[4]; _mm256_storeu_pd(a, v); return unifyValue(a[r & 3]); } -static INLINE __m256 set__m256(float d, int r) { static float a[8]; memrand(a, sizeof(a)); a[r & 7] = d; return _mm256_loadu_ps(a); } -static INLINE float get__m256(__m256 v, int r) { static float a[8]; _mm256_storeu_ps(a, v); return unifyValuef(a[r & 7]); } +static INLINE __m256d set__m256d(double d, int r) FUNC_ATTR { static double a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return _mm256_loadu_pd(a); } +static INLINE double get__m256d(__m256d v, int r) FUNC_ATTR { static double a[4]; _mm256_storeu_pd(a, v); return unifyValue(a[r & 3]); } +static INLINE __m256 set__m256(float d, int r) FUNC_ATTR { static float a[8]; memrand(a, sizeof(a)); a[r & 7] = d; return _mm256_loadu_ps(a); } +static INLINE float get__m256(__m256 v, int r) FUNC_ATTR { static float a[8]; _mm256_storeu_ps(a, v); return unifyValuef(a[r & 7]); } #endif #if defined(__AVX512F__) -static INLINE __m512d set__m512d(double d, int r) { static double a[8]; memrand(a, sizeof(a)); a[r & 7] = d; return _mm512_loadu_pd(a); } -static INLINE double get__m512d(__m512d v, int r) { static double a[8]; _mm512_storeu_pd(a, v); return unifyValue(a[r & 7]); } -static INLINE __m512 set__m512(float d, int r) { static float a[16]; memrand(a, sizeof(a)); a[r & 15] = d; return _mm512_loadu_ps(a); } -static INLINE float get__m512(__m512 v, int r) { static float a[16]; _mm512_storeu_ps(a, v); return unifyValuef(a[r & 15]); } +static INLINE __m512d set__m512d(double d, int r) FUNC_ATTR { static double a[8]; memrand(a, sizeof(a)); a[r & 7] = d; return _mm512_loadu_pd(a); } +static INLINE double get__m512d(__m512d v, int r) FUNC_ATTR { static double a[8]; _mm512_storeu_pd(a, v); return unifyValue(a[r & 7]); } +static INLINE __m512 
set__m512(float d, int r) FUNC_ATTR { static float a[16]; memrand(a, sizeof(a)); a[r & 15] = d; return _mm512_loadu_ps(a); } +static INLINE float get__m512(__m512 v, int r) FUNC_ATTR { static float a[16]; _mm512_storeu_ps(a, v); return unifyValuef(a[r & 15]); } #endif #endif // #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) -#if defined(__aarch64__) && defined(__ARM_NEON) -static INLINE VECTOR_CC float64x2_t setfloat64x2_t(double d, int r) { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return vld1q_f64(a); } -static INLINE VECTOR_CC double getfloat64x2_t(float64x2_t v, int r) { double a[2]; vst1q_f64(a, v); return unifyValue(a[r & 1]); } -static INLINE VECTOR_CC float32x4_t setfloat32x4_t(float d, int r) { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return vld1q_f32(a); } -static INLINE VECTOR_CC float getfloat32x4_t(float32x4_t v, int r) { float a[4]; vst1q_f32(a, v); return unifyValuef(a[r & 3]); } +#if defined(__aarch64__) && defined(__ARM_NEON) && !defined(__ARM_FEATURE_SME) +static INLINE VECTOR_CC float64x2_t setfloat64x2_t(double d, int r) FUNC_ATTR { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return vld1q_f64(a); } +static INLINE VECTOR_CC double getfloat64x2_t(float64x2_t v, int r) FUNC_ATTR { double a[2]; vst1q_f64(a, v); return unifyValue(a[r & 1]); } +static INLINE VECTOR_CC float32x4_t setfloat32x4_t(float d, int r) FUNC_ATTR { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return vld1q_f32(a); } +static INLINE VECTOR_CC float getfloat32x4_t(float32x4_t v, int r) FUNC_ATTR { float a[4]; vst1q_f32(a, v); return unifyValuef(a[r & 3]); } #endif #ifdef __ARM_FEATURE_SVE -static INLINE svfloat64_t setsvfloat64_t(double d, int r) { double a[svcntd()]; memrand(a, sizeof(a)); a[r & (svcntd()-1)] = d; return svld1_f64(svptrue_b8(), a); } -static INLINE double getsvfloat64_t(svfloat64_t v, int r) { double a[svcntd()]; svst1_f64(svptrue_b8(), a, v); return unifyValue(a[r & (svcntd()-1)]); } -static INLINE svfloat32_t 
setsvfloat32_t(float d, int r) { float a[svcntw()]; memrand(a, sizeof(a)); a[r & (svcntw()-1)] = d; return svld1_f32(svptrue_b8(), a); } -static INLINE float getsvfloat32_t(svfloat32_t v, int r) { float a[svcntw()]; svst1_f32(svptrue_b8(), a, v); return unifyValuef(a[r & (svcntw()-1)]); } - -static svfloat64_t vd2getx_vd_vd2(svfloat64x2_t v) { return svget2_f64(v, 0); } -static svfloat64_t vd2gety_vd_vd2(svfloat64x2_t v) { return svget2_f64(v, 1); } -static svfloat32_t vf2getx_vf_vf2(svfloat32x2_t v) { return svget2_f32(v, 0); } -static svfloat32_t vf2gety_vf_vf2(svfloat32x2_t v) { return svget2_f32(v, 1); } +static INLINE svfloat64_t setsvfloat64_t(double d, int r) FUNC_ATTR { double a[svcntd()]; memrand(a, sizeof(a)); a[r & (svcntd()-1)] = d; return svld1_f64(svptrue_b8(), a); } +static INLINE double getsvfloat64_t(svfloat64_t v, int r) FUNC_ATTR { double a[svcntd()]; svst1_f64(svptrue_b8(), a, v); return unifyValue(a[r & (svcntd()-1)]); } +static INLINE svfloat32_t setsvfloat32_t(float d, int r) FUNC_ATTR { float a[svcntw()]; memrand(a, sizeof(a)); a[r & (svcntw()-1)] = d; return svld1_f32(svptrue_b8(), a); } +static INLINE float getsvfloat32_t(svfloat32_t v, int r) FUNC_ATTR { float a[svcntw()]; svst1_f32(svptrue_b8(), a, v); return unifyValuef(a[r & (svcntw()-1)]); } + +static svfloat64_t vd2getx_vd_vd2(svfloat64x2_t v) FUNC_ATTR { return svget2_f64(v, 0); } +static svfloat64_t vd2gety_vd_vd2(svfloat64x2_t v) FUNC_ATTR { return svget2_f64(v, 1); } +static svfloat32_t vf2getx_vf_vf2(svfloat32x2_t v) FUNC_ATTR { return svget2_f32(v, 0); } +static svfloat32_t vf2gety_vf_vf2(svfloat32x2_t v) FUNC_ATTR { return svget2_f32(v, 1); } #endif #ifdef __VSX__ -static INLINE __vector double setSLEEF_VECTOR_DOUBLE(double d, int r) { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return vec_vsx_ld(0, a); } -static INLINE double getSLEEF_VECTOR_DOUBLE(__vector double v, int r) { double a[2]; vec_vsx_st(v, 0, a); return unifyValue(a[r & 1]); } -static INLINE __vector float 
setSLEEF_VECTOR_FLOAT(float d, int r) { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return vec_vsx_ld(0, a); } -static INLINE float getSLEEF_VECTOR_FLOAT(__vector float v, int r) { float a[4]; vec_vsx_st(v, 0, a); return unifyValuef(a[r & 3]); } +static INLINE __vector double setSLEEF_VECTOR_DOUBLE(double d, int r) FUNC_ATTR { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return vec_vsx_ld(0, a); } +static INLINE double getSLEEF_VECTOR_DOUBLE(__vector double v, int r) FUNC_ATTR { double a[2]; vec_vsx_st(v, 0, a); return unifyValue(a[r & 1]); } +static INLINE __vector float setSLEEF_VECTOR_FLOAT(float d, int r) FUNC_ATTR { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return vec_vsx_ld(0, a); } +static INLINE float getSLEEF_VECTOR_FLOAT(__vector float v, int r) FUNC_ATTR { float a[4]; vec_vsx_st(v, 0, a); return unifyValuef(a[r & 3]); } #endif #ifdef __VX__ -static INLINE __attribute__((vector_size(16))) double setSLEEF_VECTOR_DOUBLE(double d, int r) { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return (__attribute__((vector_size(16))) double) { a[0], a[1] }; } -static INLINE double getSLEEF_VECTOR_DOUBLE(__attribute__((vector_size(16))) double v, int r) { return unifyValue(v[r & 1]); } -static INLINE __attribute__((vector_size(16))) float setSLEEF_VECTOR_FLOAT(float d, int r) { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return (__attribute__((vector_size(16))) float) { a[0], a[1], a[2], a[3] }; } -static INLINE float getSLEEF_VECTOR_FLOAT(__attribute__((vector_size(16))) float v, int r) { return unifyValuef(v[r & 3]); } +static INLINE __attribute__((vector_size(16))) double setSLEEF_VECTOR_DOUBLE(double d, int r) FUNC_ATTR { double a[2]; memrand(a, sizeof(a)); a[r & 1] = d; return (__attribute__((vector_size(16))) double) { a[0], a[1] }; } +static INLINE double getSLEEF_VECTOR_DOUBLE(__attribute__((vector_size(16))) double v, int r) FUNC_ATTR { return unifyValue(v[r & 1]); } +static INLINE __attribute__((vector_size(16))) float
setSLEEF_VECTOR_FLOAT(float d, int r) FUNC_ATTR { float a[4]; memrand(a, sizeof(a)); a[r & 3] = d; return (__attribute__((vector_size(16))) float) { a[0], a[1], a[2], a[3] }; } +static INLINE float getSLEEF_VECTOR_FLOAT(__attribute__((vector_size(16))) float v, int r) FUNC_ATTR { return unifyValuef(v[r & 3]); } #endif #if __riscv && __riscv_v @@ -107,29 +115,29 @@ static INLINE float getSLEEF_VECTOR_FLOAT(__attribute__((vector_size(16))) float #define VECTLENSP (1 * __riscv_vlenb() / sizeof(float)) #define VECTLENDP (1 * __riscv_vlenb() / sizeof(double)) -static INLINE vfloat32m1_t setvfloat32m1_t(float d, int r) { float a[VECTLENSP]; memrand(a, sizeof(a)); a[r & (VECTLENSP-1)] = d; return __riscv_vle32_v_f32m1(a, VECTLENSP); } -static INLINE float getvfloat32m1_t(vfloat32m1_t v, int r) { float a[VECTLENSP]; __riscv_vse32(a, v, VECTLENSP); return unifyValuef(a[r & (VECTLENSP-1)]); } -static INLINE vfloat64m1_t setvfloat64m1_t(double d, int r) { double a[VECTLENDP]; memrand(a, sizeof(a)); a[r & (VECTLENDP-1)] = d; return __riscv_vle64_v_f64m1(a, VECTLENDP); } -static INLINE double getvfloat64m1_t(vfloat64m1_t v, int r) { double a[VECTLENDP]; __riscv_vse64(a, v, VECTLENDP); return unifyValue(a[r & (VECTLENDP-1)]); } +static INLINE vfloat32m1_t setvfloat32m1_t(float d, int r) FUNC_ATTR { float a[VECTLENSP]; memrand(a, sizeof(a)); a[r & (VECTLENSP-1)] = d; return __riscv_vle32_v_f32m1(a, VECTLENSP); } +static INLINE float getvfloat32m1_t(vfloat32m1_t v, int r) FUNC_ATTR { float a[VECTLENSP]; __riscv_vse32(a, v, VECTLENSP); return unifyValuef(a[r & (VECTLENSP-1)]); } +static INLINE vfloat64m1_t setvfloat64m1_t(double d, int r) FUNC_ATTR { double a[VECTLENDP]; memrand(a, sizeof(a)); a[r & (VECTLENDP-1)] = d; return __riscv_vle64_v_f64m1(a, VECTLENDP); } +static INLINE double getvfloat64m1_t(vfloat64m1_t v, int r) FUNC_ATTR { double a[VECTLENDP]; __riscv_vse64(a, v, VECTLENDP); return unifyValue(a[r & (VECTLENDP-1)]); } -static vfloat32m1_t
vf2getx_vf_vf2(vfloat32m2_t v) { return __riscv_vget_f32m1(v, 0); } -static vfloat32m1_t vf2gety_vf_vf2(vfloat32m2_t v) { return __riscv_vget_f32m1(v, 1); } -static vfloat64m1_t vd2getx_vd_vd2(vfloat64m2_t v) { return __riscv_vget_f64m1(v, 0); } -static vfloat64m1_t vd2gety_vd_vd2(vfloat64m2_t v) { return __riscv_vget_f64m1(v, 1); } +static vfloat32m1_t vf2getx_vf_vf2(vfloat32m2_t v) FUNC_ATTR { return __riscv_vget_f32m1(v, 0); } +static vfloat32m1_t vf2gety_vf_vf2(vfloat32m2_t v) FUNC_ATTR { return __riscv_vget_f32m1(v, 1); } +static vfloat64m1_t vd2getx_vd_vd2(vfloat64m2_t v) FUNC_ATTR { return __riscv_vget_f64m1(v, 0); } +static vfloat64m1_t vd2gety_vd_vd2(vfloat64m2_t v) FUNC_ATTR { return __riscv_vget_f64m1(v, 1); } #elif defined(ENABLE_RVVM2) #define VECTLENSP (2 * __riscv_vlenb() / sizeof(float)) #define VECTLENDP (2 * __riscv_vlenb() / sizeof(double)) -static INLINE vfloat32m2_t setvfloat32m2_t(float d, int r) { float a[VECTLENSP]; memrand(a, sizeof(a)); a[r & (VECTLENSP-1)] = d; return __riscv_vle32_v_f32m2(a, VECTLENSP); } -static INLINE float getvfloat32m2_t(vfloat32m2_t v, int r) { float a[VECTLENSP]; __riscv_vse32(a, v, VECTLENSP); return unifyValuef(a[r & (VECTLENSP-1)]); } -static INLINE vfloat64m2_t setvfloat64m2_t(double d, int r) { double a[VECTLENDP]; memrand(a, sizeof(a)); a[r & (VECTLENDP-1)] = d; return __riscv_vle64_v_f64m2(a, VECTLENDP); } -static INLINE double getvfloat64m2_t(vfloat64m2_t v, int r) { double a[VECTLENDP]; __riscv_vse64(a, v, VECTLENDP); return unifyValue(a[r & (VECTLENDP-1)]); } +static INLINE vfloat32m2_t setvfloat32m2_t(float d, int r) FUNC_ATTR { float a[VECTLENSP]; memrand(a, sizeof(a)); a[r & (VECTLENSP-1)] = d; return __riscv_vle32_v_f32m2(a, VECTLENSP); } +static INLINE float getvfloat32m2_t(vfloat32m2_t v, int r) FUNC_ATTR { float a[VECTLENSP]; __riscv_vse32(a, v, VECTLENSP); return unifyValuef(a[r & (VECTLENSP-1)]); } +static INLINE vfloat64m2_t setvfloat64m2_t(double d, int r) FUNC_ATTR { double a[VECTLENDP]; 
memrand(a, sizeof(a)); a[r & (VECTLENDP-1)] = d; return __riscv_vle64_v_f64m2(a, VECTLENDP); } +static INLINE double getvfloat64m2_t(vfloat64m2_t v, int r) FUNC_ATTR { double a[VECTLENDP]; __riscv_vse64(a, v, VECTLENDP); return unifyValue(a[r & (VECTLENDP-1)]); } -static vfloat32m2_t vf2getx_vf_vf2(vfloat32m4_t v) { return __riscv_vget_f32m2(v, 0); } -static vfloat32m2_t vf2gety_vf_vf2(vfloat32m4_t v) { return __riscv_vget_f32m2(v, 1); } -static vfloat64m2_t vd2getx_vd_vd2(vfloat64m4_t v) { return __riscv_vget_f64m2(v, 0); } -static vfloat64m2_t vd2gety_vd_vd2(vfloat64m4_t v) { return __riscv_vget_f64m2(v, 1); } +static vfloat32m2_t vf2getx_vf_vf2(vfloat32m4_t v) FUNC_ATTR { return __riscv_vget_f32m2(v, 0); } +static vfloat32m2_t vf2gety_vf_vf2(vfloat32m4_t v) FUNC_ATTR { return __riscv_vget_f32m2(v, 1); } +static vfloat64m2_t vd2getx_vd_vd2(vfloat64m4_t v) FUNC_ATTR { return __riscv_vget_f64m2(v, 0); } +static vfloat64m2_t vd2gety_vd_vd2(vfloat64m4_t v) FUNC_ATTR { return __riscv_vget_f64m2(v, 1); } #else #error "unknown RVV" @@ -149,10 +157,10 @@ static vfloat64m2_t vd2gety_vd_vd2(vfloat64m4_t v) { return __riscv_vget_f64m2(v #define GET(TYPE) get ## TYPE #if !defined(__ARM_FEATURE_SVE) && !(defined(__riscv) && defined(__riscv_v)) -static DPTYPE vd2getx_vd_vd2(TYPE2(DPTYPE) v) { return v.x; } -static DPTYPE vd2gety_vd_vd2(TYPE2(DPTYPE) v) { return v.y; } -static SPTYPE vf2getx_vf_vf2(TYPE2(SPTYPE) v) { return v.x; } -static SPTYPE vf2gety_vf_vf2(TYPE2(SPTYPE) v) { return v.y; } +static DPTYPE vd2getx_vd_vd2(TYPE2(DPTYPE) v) FUNC_ATTR { return v.x; } +static DPTYPE vd2gety_vd_vd2(TYPE2(DPTYPE) v) FUNC_ATTR { return v.y; } +static SPTYPE vf2getx_vf_vf2(TYPE2(SPTYPE) v) FUNC_ATTR { return v.x; } +static SPTYPE vf2gety_vf_vf2(TYPE2(SPTYPE) v) FUNC_ATTR { return v.y; } #endif // @@ -343,7 +351,7 @@ static SPTYPE vf2gety_vf_vf2(TYPE2(SPTYPE) v) { return v.y; } #define try_feature(TYPE, ATR_, TSX, EXT, arg) \ GET(TYPE) (FUNC(ATR_, pow, TSX, u10, EXT) (SET(TYPE) (arg, 
0), SET(TYPE) (arg, 0)), 0) -int check_feature(double d, float f) { +int check_feature(double d, float f) TEST_FUNC_ATTR { d = try_feature(DPTYPE, ATR, DPTYPESPEC, EXTSPEC, d); return d == d; } @@ -352,8 +360,7 @@ int check_feature(double d, float f) { int success = 1; -int main2(int argc, char **argv) -{ +int main2(int argc, char **argv) TEST_FUNC_ATTR { FILE *fp = NULL; if (argc != 1) { diff --git a/src/libm/CMakeLists.txt b/src/libm/CMakeLists.txt index 2329e3324b428e3cee6cdecd8bf371903be9d791..7c6cef61e4d077b61115efc2f65dc31fb2f1e64f 100644 --- a/src/libm/CMakeLists.txt +++ b/src/libm/CMakeLists.txt @@ -25,6 +25,7 @@ elseif(SLEEF_ARCH_AARCH64) ADVSIMDNOFMA SVE SVENOFMA + SVESTREAM PUREC_SCALAR PURECFMA_SCALAR DSP_SCALAR @@ -90,6 +91,7 @@ command_arguments(HEADER_PARAMS_ADVSIMD finz_ 2 4 float64x2_t float32x4_ command_arguments(HEADER_PARAMS_ADVSIMDNOFMA cinz_ 2 4 float64x2_t float32x4_t int32x2_t int32x4_t __ARM_NEON advsimdnofma) command_arguments(HEADER_PARAMS_SVE finz_ x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SVE sve) command_arguments(HEADER_PARAMS_SVENOFMA cinz_ x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SVE svenofma) +command_arguments(HEADER_PARAMS_SVESTREAM finz_ x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SME svestream) command_arguments(HEADER_PARAMS_NEON32_ - 2 4 - float32x4_t int32x2_t int32x4_t __ARM_NEON__) command_arguments(HEADER_PARAMS_NEON32 cinz_ 2 4 - float32x4_t int32x2_t int32x4_t __ARM_NEON__ neon) @@ -147,6 +149,7 @@ command_arguments(RENAME_PARAMS_CUDA finz_ 1 1 cuda) # the "x" token of VLA SVE vector functions. 
command_arguments(RENAME_PARAMS_SVE finz_ x x sve) command_arguments(RENAME_PARAMS_SVENOFMA cinz_ x x svenofma) +command_arguments(RENAME_PARAMS_SVESTREAM finz_ x x svestream) command_arguments(RENAME_PARAMS_GNUABI_SSE2 sse2 b 2 4 _mm128d _mm128 _mm128i _mm128i __SSE2__) command_arguments(RENAME_PARAMS_GNUABI_AVX avx c 4 8 __m256d __m256 __m128i "struct { __m128i x, y$ }" __AVX__) @@ -156,7 +159,8 @@ command_arguments(RENAME_PARAMS_GNUABI_ADVSIMD advsimd n 2 4 float64x2_t float3 # The vector length parameters in SVE, for SP and DP, are chosen for # the smallest SVE vector size (128-bit). The name is generated using # the "x" token of VLA SVE vector functions. -command_arguments(RENAME_PARAMS_GNUABI_SVE sve s x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_SVE) +command_arguments(RENAME_PARAMS_GNUABI_SVE sve s x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SVE) +command_arguments(RENAME_PARAMS_GNUABI_SVESTREAM sve c x x svfloat64_t svfloat32_t svint32_t svint32_t __ARM_FEATURE_SME) command_arguments(RENAME_PARAMS_RVVM1 finz_ x x rvvm1) command_arguments(RENAME_PARAMS_RVVM1NOFMA cinz_ x x rvvm1nofma) diff --git a/src/libm/mkalias.c b/src/libm/mkalias.c index 633786d7e0a2bc48ae75357f36219ae9a175f405..f3d98fc4281b858faf636241119ae588e0e1741a 100644 --- a/src/libm/mkalias.c +++ b/src/libm/mkalias.c @@ -25,12 +25,14 @@ int main(int argc, char **argv) { int genAliasVectorABI = (mangledisa[0] != '-'); char *isaname = argc == 6 ? 
argv[5] : ""; - char * vectorcc=""; + char * funcattr=""; #ifdef ENABLE_AAVPCS if (strcmp(isaname, "advsimd") == 0) - vectorcc =" __attribute__((aarch64_vector_pcs))"; + funcattr =" __attribute__((aarch64_vector_pcs))"; genAliasVectorABI = 0; #endif + if (strcmp(isaname, "svestream") == 0) + funcattr =" __arm_streaming_compatible"; static char *argType2[] = { "a0", "a0, a1", "a0", "a0, a1", @@ -91,14 +93,14 @@ int main(int argc, char **argv) { returnType[funcList[i].funcType], funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, argType0[funcList[i].funcType], - funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, isaname, vectorcc + funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, isaname, funcattr ); if (genAliasVectorABI && vparameterStr[funcList[i].funcType] != NULL) { - printf("EXPORT CONST VECTOR_CC %s _ZGV%sN%d%s_Sleef_%s%s_u%02d(%s) __attribute__((alias(\"Sleef_%s%s%d_u%02d%s\")))%s;\n", + printf("EXPORT CONST %s _ZGV%sN%d%s_Sleef_%s%s_u%02d(%s) __attribute__((alias(\"Sleef_%s%s%d_u%02d%s\")))%s;\n", returnType[funcList[i].funcType], mangledisa, vw, vparameterStr[funcList[i].funcType], funcList[i].name, typeSpecS[fptype], funcList[i].ulp, argType0[funcList[i].funcType], - funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, isaname, vectorcc + funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, isaname, funcattr ); } } else { @@ -106,14 +108,14 @@ int main(int argc, char **argv) { returnType[funcList[i].funcType], funcList[i].name, typeSpec[fptype], vw, argType0[funcList[i].funcType], - funcList[i].name, typeSpec[fptype], vw, isaname, vectorcc + funcList[i].name, typeSpec[fptype], vw, isaname, funcattr ); if (genAliasVectorABI && vparameterStr[funcList[i].funcType] != NULL) { - printf("EXPORT CONST VECTOR_CC %s _ZGV%sN%d%s_Sleef_%s%s(%s) __attribute__((alias(\"Sleef_%s%s%d_%s\")))%s;\n", + printf("EXPORT CONST %s _ZGV%sN%d%s_Sleef_%s%s(%s) __attribute__((alias(\"Sleef_%s%s%d_%s\")))%s;\n", returnType[funcList[i].funcType], 
mangledisa, vw, vparameterStr[funcList[i].funcType], funcList[i].name, typeSpecS[fptype], argType0[funcList[i].funcType], - funcList[i].name, typeSpec[fptype], vw, isaname, vectorcc + funcList[i].name, typeSpec[fptype], vw, isaname, funcattr ); } } @@ -129,7 +131,7 @@ int main(int argc, char **argv) { if (fptype == 0 && (funcList[i].flags & 2) != 0) continue; if (funcList[i].ulp >= 0) { printf("EXPORT CONST %s %s Sleef_%s%s%d_u%02d(%s) { return Sleef_%s%s%d_u%02d%s(%s); }\n", - returnType[funcList[i].funcType], vectorcc, + returnType[funcList[i].funcType], funcattr, funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, argType1[funcList[i].funcType], funcList[i].name, typeSpec[fptype], vw, funcList[i].ulp, isaname, @@ -137,7 +139,7 @@ int main(int argc, char **argv) { ); } else { printf("EXPORT CONST %s %s Sleef_%s%s%d(%s) { return Sleef_%s%s%d_%s(%s); }\n", - returnType[funcList[i].funcType], vectorcc, + returnType[funcList[i].funcType], funcattr, funcList[i].name, typeSpec[fptype], vw, argType1[funcList[i].funcType], funcList[i].name, typeSpec[fptype], vw, isaname, diff --git a/src/libm/mkrename.c b/src/libm/mkrename.c index edae03cda68ac5ced458259693f374f811fc6a66..719eb61401c76eda878171340b8c1b1120288008 100644 --- a/src/libm/mkrename.c +++ b/src/libm/mkrename.c @@ -148,11 +148,13 @@ int main(int argc, char **argv) { if (strcmp(isaname, "sve") == 0) wdp = wsp = "x"; - char * vectorcc = ""; + char * funcattr = ""; #ifdef ENABLE_AAVPCS if (strcmp(isaname, "advsimd") == 0) - vectorcc =" __attribute__((aarch64_vector_pcs))"; + funcattr =" __attribute__((aarch64_vector_pcs))"; #endif + if (strcmp(isaname, "svestream") == 0) + funcattr = " __arm_streaming_compatible"; printf("#ifdef %s\n", architecture); @@ -189,7 +191,7 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s_u%02d%s(%s)%s;\n", vdoublename, @@ -197,7 
+199,7 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); } } else { printf("%sSLEEF_IMPORT SLEEF_CONST %s Sleef_%sd%s%s%s(%s)%s;\n", @@ -206,7 +208,7 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s%s%s(%s)%s;\n", vdoublename, @@ -214,7 +216,7 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); } } break; @@ -226,14 +228,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s_u%02d%s(%s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vdoublename, - vectorcc); + funcattr); } } else { printf("%sSLEEF_IMPORT SLEEF_CONST %s Sleef_%sd%s%s%s(%s, %s)%s;\n", @@ -242,14 +244,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s%s%s(%s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, isaub, isaname, vdoublename, vdoublename, - vectorcc); + funcattr); } } break; @@ -261,14 +263,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%s%sd%s_u%02d%s(%s)%s;\n", vdoublename_escspace, atrPrefix, funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); } } else { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%sd%s%s%s(%s)%s;\n", @@ -276,14 +278,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); if 
(atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%s%sd%s%s%s(%s)%s;\n", vdoublename_escspace, atrPrefix, funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); } } break; @@ -294,14 +296,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vintname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s_u%02d%s(%s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vintname, - vectorcc); + funcattr); } } else { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%sd%s%s%s(%s, %s)%s;\n", @@ -309,14 +311,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, vintname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s%s%s(%s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, isaub, isaname, vdoublename, vintname, - vectorcc); + funcattr); } } break; @@ -327,14 +329,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s_u%02d%s(%s)%s;\n", vintname, atrPrefix, funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, - vectorcc); + funcattr); } } else { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%sd%s%s%s(%s)%s;\n", @@ -342,14 +344,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s%s%s(%s)%s;\n", vintname, atrPrefix, funcList[i].name, wdp, isaub, isaname, vdoublename, - vectorcc); + funcattr); } } break; @@ -361,14 +363,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vdoublename, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { 
printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s_u%02d%s(%s, %s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, funcList[i].ulp, isaname, vdoublename, vdoublename, vdoublename, - vectorcc); + funcattr); } } else { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%sd%s%s%s(%s, %s, %s)%s;\n", @@ -376,14 +378,14 @@ int main(int argc, char **argv) { funcList[i].name, wdp, isaub, isaname, vdoublename, vdoublename, vdoublename, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sd%s%s%s(%s, %s, %s)%s;\n", vdoublename, atrPrefix, funcList[i].name, wdp, isaub, isaname, vdoublename, vdoublename, vdoublename, - vectorcc); + funcattr); } } break; @@ -391,6 +393,10 @@ int main(int argc, char **argv) { // They do not have vector type as argument or return value. // Also, the corresponding definition (`getPtr` and `getInt`) in `sleefsimd*.c` // are not defined with `VECTOR_CC`. (Same for single precision case below) + // + // What about other attributes? should we separate vectorcc and funcattr? + // Is this a good enough reason? For now we try to just pass no attribute. + // Maybe we should just have a branch here? 
case 7: printf("SLEEF_IMPORT SLEEF_CONST int Sleef_%sd%s%s%s(int);\n", funcList[i].name, wdp, isaub, isaname); @@ -438,14 +444,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s_u%02d%s(%s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, - vectorcc); + funcattr); } } else { printf("%sSLEEF_IMPORT SLEEF_CONST %s Sleef_%sf%s%s%s(%s)%s;\n", @@ -454,14 +460,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, isaub, isaname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s%s%s(%s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, isaub, isaname, vfloatname, - vectorcc); + funcattr); } } break; @@ -472,14 +478,14 @@ int main(int argc, char **argv) { vfloatname, funcList[i].name, wsp, funcList[i].ulp, isaname, - vfloatname, vfloatname, vectorcc); + vfloatname, vfloatname, funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s_u%02d%s(%s, %s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, vfloatname, - vectorcc); + funcattr); } } else { printf("%sSLEEF_IMPORT SLEEF_CONST %s Sleef_%sf%s%s%s(%s, %s)%s;\n", @@ -488,14 +494,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, isaub, isaname, vfloatname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s%s%s(%s, %s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, isaub, isaname, vfloatname, vfloatname, - vectorcc); + funcattr); } } break; @@ -507,14 +513,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%s%sf%s_u%02d%s(%s)%s;\n", vfloatname_escspace, atrPrefix, 
funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, - vectorcc); + funcattr); } } else { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%sf%s%s%s(%s)%s;\n", @@ -522,14 +528,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, isaub, isaname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST Sleef_%s_2 Sleef_%s%sf%s%s%s(%s)%s;\n", vfloatname_escspace, atrPrefix, funcList[i].name, wsp, isaub, isaname, vfloatname, - vectorcc); + funcattr); } } break; @@ -557,14 +563,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, vfloatname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s_u%02d%s(%s, %s, %s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, funcList[i].ulp, isaname, vfloatname, vfloatname, vfloatname, - vectorcc); + funcattr); } } else { printf("%sSLEEF_IMPORT SLEEF_CONST %s Sleef_%sf%s%s%s(%s, %s, %s)%s;\n", @@ -573,14 +579,14 @@ int main(int argc, char **argv) { funcList[i].name, wsp, isaub, isaname, vfloatname, vfloatname, vfloatname, - vectorcc); + funcattr); if (atrPrefix != NULL) { printf("SLEEF_IMPORT SLEEF_CONST %s Sleef_%s%sf%s%s%s(%s, %s, %s)%s;\n", vfloatname, atrPrefix, funcList[i].name, wsp, isaub, isaname, vfloatname, vfloatname, vfloatname, - vectorcc); + funcattr); } } break; diff --git a/src/libm/sleefinline_header.h.org b/src/libm/sleefinline_header.h.org index a2cb471e7f6aa278bbd14cb0112a2d3d176c1a3f..f267035f204ef21a01f2d3ea5fb8f992f8ef8815 100644 --- a/src/libm/sleefinline_header.h.org +++ b/src/libm/sleefinline_header.h.org @@ -18,6 +18,16 @@ #define SLEEF_INLINE static inline #endif +#ifndef SLEEF_NOINLINE +#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER) +#define SLEEF_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define SLEEF_NOINLINE __declspec(noinline) +#else +#define SLEEF_NOINLINE +#endif +#endif + #ifndef 
SLEEF_CONST #define SLEEF_CONST #endif diff --git a/src/libm/sleeflibm_header.h.org.in b/src/libm/sleeflibm_header.h.org.in index 66d88db3c702591a1b28cf78b7c5ebf6e673b01b..7ea19240aa12dcd8ec75d0806dde4681a1d1e14b 100644 --- a/src/libm/sleeflibm_header.h.org.in +++ b/src/libm/sleeflibm_header.h.org.in @@ -63,6 +63,10 @@ #include <arm_sve.h> #endif +#if defined(__ARM_FEATURE_SME) +#include <arm_sme.h> +#endif + #if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) #include <altivec.h> typedef __vector double SLEEF_VECTOR_DOUBLE; diff --git a/src/libm/sleefsimddp.c b/src/libm/sleefsimddp.c index 8186ff321fb3adb6e225d54f5b93ed5850e96d5b..fda80c2e2d78c6cdadc1d08d257aca95e6151f14 100644 --- a/src/libm/sleefsimddp.c +++ b/src/libm/sleefsimddp.c @@ -155,6 +155,18 @@ extern const double Sleef_rempitabdp[]; #endif /* DORENAME */ #endif /* ENABLE_SVE */ +#ifdef ENABLE_SVESTREAM +#define CONFIG 3 +#include "helpersve.h" +#ifdef DORENAME +#ifdef ENABLE_GNUABI +#include "renamesvestream_gnuabi.h" +#else +#include "renamesvestream.h" +#endif /* ENABLE_GNUABI */ +#endif /* DORENAME */ +#endif /* ENABLE_SVESTREAM */ + // IBM #ifdef ENABLE_VSX @@ -312,16 +324,16 @@ extern const double Sleef_rempitabdp[]; #include "commonfuncs.h" // return d0 < d1 ? x : y -static INLINE CONST VECTOR_CC vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); } +static INLINE CONST VECTOR_CC vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) FUNC_ATTR { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); } // return d0 < 0 ?
x : 0 -static INLINE CONST VECTOR_CC vint vsel_vi_vd_vi(vdouble d, vint x) { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); } +static INLINE CONST VECTOR_CC vint vsel_vi_vd_vi(vdouble d, vint x) FUNC_ATTR { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); } // -EXPORT CONST VECTOR_CC vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); } +EXPORT CONST VECTOR_CC vdouble xldexp(vdouble x, vint q) FUNC_ATTR { return vldexp_vd_vd_vi(x, q); } -EXPORT CONST VECTOR_CC vint xilogb(vdouble d) { +EXPORT CONST VECTOR_CC vint xilogb(vdouble d) FUNC_ATTR { vdouble e = vcast_vd_vi(vilogbk_vi_vd(vabs_vd_vd(d))); e = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_FP_ILOGB0), e); e = vsel_vd_vo_vd_vd(visnan_vo_vd(d), vcast_vd_d(SLEEF_FP_ILOGBNAN), e); @@ -329,7 +341,7 @@ EXPORT CONST VECTOR_CC vint xilogb(vdouble d) { return vrint_vi_vd(e); } -static INLINE CONST ddi_t rempi(vdouble a) { +static INLINE CONST ddi_t rempi_core(vdouble a) { vdouble2 x, y; vint ex = vilogb2k_vi_vd(a); #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) @@ -363,7 +375,16 @@ static INLINE CONST ddi_t rempi(vdouble a) { return ddisetddi_ddi_vd2_vi(x, q); } -EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) { +#if defined(ENABLE_SVESTREAM) +// noinline to prevent spills causing LSRT hazards +static NOINLINE CONST ddi_t rempi(vdouble a) FUNC_ATTR { + return rempi_stream(a, &rempi_core); +} +#else +#define rempi rempi_core +#endif + +EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) // The SIMD source files(sleefsimd?p.c) are compiled twice for each // vector extension, with DETERMINISTIC macro turned on and off. 
@@ -502,7 +523,7 @@ EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vdouble u; vdouble2 s, t, x; @@ -633,7 +654,7 @@ EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vdouble u, s, r = d; vint ql; @@ -768,7 +789,7 @@ EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vdouble u; vdouble2 s, t, x; @@ -923,7 +944,7 @@ EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { #define XMODF xmodf #endif -TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) { +TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vopmask o; vdouble u, t, rx, ry, s; @@ -1067,7 +1088,7 @@ TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS_U1(vdouble d) { +TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS_U1(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vopmask o; vdouble u, rx, ry; @@ -1226,7 +1247,7 @@ TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS_U1(vdouble d) { } #if !defined(DETERMINISTIC) -TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) { +TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) FUNC_ATTR { vopmask o; vdouble u, s, t, rx, ry; vdouble2 r, x, s2; @@ -1292,7 +1313,7 @@ TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) { return r; } -TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) { +TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) FUNC_ATTR { vopmask o; vdouble u, s, t, rx, ry; vdouble2 r; @@ 
-1352,7 +1373,7 @@ TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) { return r; } -TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) { +TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) FUNC_ATTR { vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))))); fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr))); fr = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), vcast_vd_d(0), fr); @@ -1365,31 +1386,31 @@ TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) { } #ifdef ENABLE_GNUABI -EXPORT VECTOR_CC void xsincos(vdouble a, double *ps, double *pc) { +EXPORT VECTOR_CC void xsincos(vdouble a, double *ps, double *pc) FUNC_ATTR { vdouble2 r = sincosk(a); vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r)); vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r)); } -EXPORT VECTOR_CC void xsincos_u1(vdouble a, double *ps, double *pc) { +EXPORT VECTOR_CC void xsincos_u1(vdouble a, double *ps, double *pc) FUNC_ATTR { vdouble2 r = sincosk_u1(a); vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r)); vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r)); } -EXPORT VECTOR_CC void xsincospi_u05(vdouble a, double *ps, double *pc) { +EXPORT VECTOR_CC void xsincospi_u05(vdouble a, double *ps, double *pc) FUNC_ATTR { vdouble2 r = sincospik_u05(a); vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r)); vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r)); } -EXPORT VECTOR_CC void xsincospi_u35(vdouble a, double *ps, double *pc) { +EXPORT VECTOR_CC void xsincospi_u35(vdouble a, double *ps, double *pc) FUNC_ATTR { vdouble2 r = sincospik_u35(a); vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r)); vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r)); } -EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) { +EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) FUNC_ATTR { vdouble2 r = modfk(a); vstoreu_v_p_vd(iptr, vd2gety_vd_vd2(r)); return vd2getx_vd_vd2(r); @@ -1397,7 +1418,7 @@ EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) { #endif 
// #ifdef ENABLE_GNUABI #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) { +static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) FUNC_ATTR { vopmask o; vdouble u, s, t; vdouble2 x, s2; @@ -1437,7 +1458,7 @@ static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) { return x; } -EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) FUNC_ATTR { vdouble2 x = sinpik(d); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); @@ -1448,7 +1469,7 @@ EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) { return r; } -static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) { +static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) FUNC_ATTR { vopmask o; vdouble u, s, t; vdouble2 x, s2; @@ -1488,7 +1509,7 @@ static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) { return x; } -EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) FUNC_ATTR { vdouble2 x = cospik(d); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); @@ -1498,7 +1519,7 @@ EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) { return r; } -EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vdouble u, s, x, y; vopmask o; @@ -1626,7 +1647,7 @@ EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) FUNC_ATTR { #if !defined(DETERMINISTIC) vdouble u; vdouble2 s, t, x, y; @@ -1772,7 +1793,7 @@ EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) { #endif // #if !defined(DETERMINISTIC) } -static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) { +static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) FUNC_ATTR { vdouble s, t, u; vint q; vopmask p; @@ -1816,7 +1837,7 @@ static INLINE CONST 
VECTOR_CC vdouble atan2k(vdouble y, vdouble x) { return t; } -static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) { +static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) FUNC_ATTR { vdouble u; vdouble2 s, t; vint q; @@ -1867,11 +1888,11 @@ static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) { return t; } -static INLINE CONST VECTOR_CC vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) { +static INLINE CONST VECTOR_CC vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) FUNC_ATTR { return vreinterpret_vd_vm(vand_vm_vo64_vm(visinf_vo_vd(d), vor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(m)))); } -EXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) { +EXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) FUNC_ATTR { vdouble r = atan2k(vabs_vd_vd(y), x); r = vmulsign_vd_vd_vd(r, x); @@ -1883,7 +1904,7 @@ EXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) { return r; } -EXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) { +EXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(5.5626846462680083984e-309)); // nexttoward((1.0 / DBL_MAX), 1) x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 53)), x); y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(UINT64_C(1) << 53)), y); @@ -1900,7 +1921,7 @@ EXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) { return r; } -EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5)); vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))); vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u; @@ -1926,7 +1947,7 @@ EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) { return vmulsign_vd_vd_vd(r, d); } 
-EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5)); vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u; vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2)); @@ -1956,7 +1977,7 @@ EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) { return vmulsign_vd_vd_vd(r, d); } -EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5)); vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u; @@ -1988,7 +2009,7 @@ EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) { vneg_vd_vd(r))), r); } -EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) FUNC_ATTR { vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5)); vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u; vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2)); @@ -2023,14 +2044,14 @@ EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) { return vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)); } -EXPORT CONST VECTOR_CC vdouble xatan_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xatan_u1(vdouble d) FUNC_ATTR { vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2)); r = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(1.570796326794896557998982), r); return vmulsign_vd_vd_vd(r, d); } -EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) { +EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) FUNC_ATTR { vdouble t, u; 
vint q; #if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR) @@ -2080,7 +2101,7 @@ EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) FUNC_ATTR { vdouble x, x2; vdouble t, m; @@ -2127,7 +2148,7 @@ EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) { } #endif // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s; vint q = vrint_vi_vd(u); @@ -2177,7 +2198,7 @@ EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) { return u; } -static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) { +static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s; vint q = vrint_vi_vd(u); @@ -2205,7 +2226,7 @@ static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) { return u; } -static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) { +static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) FUNC_ATTR { vdouble2 x, x2, s; vdouble t, m; @@ -2252,7 +2273,7 @@ static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) FUNC_ATTR { vdouble2 x; vdouble t, m, x2; @@ -2304,7 +2325,7 @@ EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) FUNC_ATTR { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); @@ -2340,7 +2361,7 @@ static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { 
+EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) FUNC_ATTR { vopmask yisint = visint_vo_vd(y); vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), yisint); @@ -2376,7 +2397,7 @@ EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) FUNC_ATTR { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); @@ -2414,7 +2435,7 @@ static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); @@ -2427,7 +2448,7 @@ EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); @@ -2439,7 +2460,7 @@ EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); vdouble2 e = ddrec_vd2_vd2(d); @@ -2453,7 +2474,7 @@ EXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) FUNC_ATTR { vdouble e = expm1k(vabs_vd_vd(x)); vdouble y = vdiv_vd_vd_vd(vadd_vd_vd_vd(e, vcast_vd_d(2)), vadd_vd_vd_vd(e, vcast_vd_d(1))); @@ -2466,7 +2487,7 @@ EXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble 
xcosh_u35(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xcosh_u35(vdouble x) FUNC_ATTR { vdouble e = xexp(vabs_vd_vd(x)); vdouble y = vmla_vd_vd_vd_vd(vcast_vd_d(0.5), e, vdiv_vd_vd_vd(vcast_vd_d(0.5), e)); @@ -2476,7 +2497,7 @@ EXPORT CONST VECTOR_CC vdouble xcosh_u35(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) FUNC_ATTR { vdouble d = expm1k(vmul_vd_vd_vd(vcast_vd_d(2), vabs_vd_vd(x))); vdouble y = vdiv_vd_vd_vd(d, vadd_vd_vd_vd(vcast_vd_d(2), d)); @@ -2487,7 +2508,7 @@ EXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) { return y; } -static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) { +static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) FUNC_ATTR { vdouble2 x, x2, m, s; vdouble t; vint e; @@ -2518,7 +2539,7 @@ static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) { return s; } -EXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1)); vdouble2 d; @@ -2540,7 +2561,7 @@ EXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) { return y; } -EXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) FUNC_ATTR { vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x)); vdouble y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); @@ -2549,25 +2570,23 @@ EXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) { vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); - y = vreinterpret_vd_vm(vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); - y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); - + y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), 
visnan_vo_vd(x)), vreinterpret_vm_vd(y))); + return y; } -EXPORT CONST VECTOR_CC vdouble xatanh(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xatanh(vdouble x) FUNC_ATTR { vdouble y = vabs_vd_vd(x); vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y)))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(SLEEF_INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)))))); y = vmulsign_vd_vd_vd(y, x); - y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(y))); - y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); + y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), visnan_vo_vd(x)), vreinterpret_vm_vd(y))); return y; } -EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) FUNC_ATTR { vdouble x, y, q = vcast_vd_d(1.0); vint e, qu, re; vdouble t; @@ -2609,7 +2628,7 @@ EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) { return y; } -EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) FUNC_ATTR { vdouble x, y, z, t; vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; vint e, qu, re; @@ -2665,7 +2684,7 @@ EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) { } #endif // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); @@ -2699,7 +2718,7 @@ EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) { return u; } -EXPORT CONST VECTOR_CC vdouble xexp2_u35(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xexp2_u35(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); @@ -2729,7 +2748,7 @@ EXPORT CONST 
VECTOR_CC vdouble xexp2_u35(vdouble d) { return u; } -EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); @@ -2762,7 +2781,7 @@ EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { return u; } -EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) FUNC_ATTR { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); @@ -2794,7 +2813,7 @@ EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) { +EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) FUNC_ATTR { vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); vdouble x = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(SLEEF_INFINITY), x); @@ -2803,7 +2822,7 @@ EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) { return x; } -EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) FUNC_ATTR { vdouble2 x; vdouble t, m, x2; @@ -2854,7 +2873,7 @@ EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) { return r; } -EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) FUNC_ATTR { vdouble2 x; vdouble t, m, x2; @@ -2906,7 +2925,7 @@ EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) { return r; } -EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) FUNC_ATTR { vdouble m, t, x, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) @@ -2953,7 +2972,7 @@ EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) { return r; } -EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) FUNC_ATTR { vdouble2 
x; vdouble t, m, x2; @@ -2995,7 +3014,12 @@ EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) { // Use log(d) if d too large to use core approximation. vopmask ocore = vle_vo_vd_vd(d, vcast_vd_d(LOG1P_BOUND)); +#if defined(ENABLE_SVESTREAM) + r = vsel_vd_vo_vd_vd(ocore, r, xlog_u1(d)); +#else if(!LIKELY(vtestallones_i_vo64 (ocore))) r = vsel_vd_vo_vd_vd(ocore, r, xlog_u1(d)); +#endif + r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r); @@ -3005,11 +3029,11 @@ EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) { // -EXPORT CONST VECTOR_CC vdouble xfabs(vdouble x) { return vabs_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xfabs(vdouble x) FUNC_ATTR { return vabs_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xcopysign(vdouble x, vdouble y) { return vcopysign_vd_vd_vd(x, y); } +EXPORT CONST VECTOR_CC vdouble xcopysign(vdouble x, vdouble y) FUNC_ATTR { return vcopysign_vd_vd_vd(x, y); } -EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) FUNC_ATTR { #if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC) return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmax_vd_vd_vd(x, y)); #else @@ -3017,7 +3041,7 @@ EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) { #endif } -EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) FUNC_ATTR { #if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC) return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmin_vd_vd_vd(x, y)); #else @@ -3025,19 +3049,19 @@ EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) { #endif } -EXPORT CONST VECTOR_CC vdouble xfdim(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble 
xfdim(vdouble x, vdouble y) FUNC_ATTR { vdouble ret = vsub_vd_vd_vd(x, y); ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(ret, vcast_vd_d(0)), veq_vo_vd_vd(x, y)), vcast_vd_d(0), ret); return ret; } -EXPORT CONST VECTOR_CC vdouble xtrunc(vdouble x) { return vtruncate2_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xfloor(vdouble x) { return vfloor2_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xceil(vdouble x) { return vceil2_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xround(vdouble x) { return vround2_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xrint(vdouble x) { return vrint2_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xtrunc(vdouble x) FUNC_ATTR { return vtruncate2_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xfloor(vdouble x) FUNC_ATTR { return vfloor2_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xceil(vdouble x) FUNC_ATTR { return vceil2_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xround(vdouble x) FUNC_ATTR { return vround2_vd_vd(x); } +EXPORT CONST VECTOR_CC vdouble xrint(vdouble x) FUNC_ATTR { return vrint2_vd_vd(x); } -EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) FUNC_ATTR { x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), y), x); vmask xi2 = vreinterpret_vm_vd(x); vopmask c = vxor_vo_vo_vo(vsignbit_vo_vd(x), vge_vo_vd_vd(y, x)); @@ -3060,7 +3084,7 @@ EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) { return ret; } -EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) { +EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) FUNC_ATTR { x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SLEEF_DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x); vmask xm = vreinterpret_vm_vd(x); @@ -3075,7 +3099,7 @@ EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) { return ret; } -EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) { +EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) FUNC_ATTR { x = 
vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SLEEF_DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x); vint ret = vcastu_vi_vm(vreinterpret_vm_vd(x)); @@ -3086,7 +3110,7 @@ EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) { return ret; } -EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) { +EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) FUNC_ATTR { #ifdef ENABLE_FMA_DP return vfma_vd_vd_vd_vd(x, y, z); #else @@ -3123,7 +3147,7 @@ EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) { #endif } -SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) { +SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) FUNC_ATTR { #if defined(ENABLE_FMA_DP) vdouble q, w, x, y, z; @@ -3192,7 +3216,7 @@ SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) { #endif } -EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) { +EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) FUNC_ATTR { #if defined(ACCURATE_SQRT) return vsqrt_vd_vd(d); #else @@ -3201,9 +3225,9 @@ EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) { #endif } -EXPORT CONST VECTOR_CC vdouble xsqrt_u35(vdouble d) { return xsqrt_u05(d); } +EXPORT CONST VECTOR_CC vdouble xsqrt_u35(vdouble d) FUNC_ATTR { return xsqrt_u05(d); } -EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) FUNC_ATTR { x = vabs_vd_vd(x); y = vabs_vd_vd(y); vdouble min = vmin_vd_vd_vd(x, y), n = min; @@ -3224,7 +3248,7 @@ EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) { return ret; } -EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x, vdouble y) FUNC_ATTR { x = vabs_vd_vd(x); y = vabs_vd_vd(y); vdouble min = vmin_vd_vd_vd(x, y); @@ -3239,7 +3263,7 @@ EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x, vdouble y) { return ret; } -static INLINE CONST VECTOR_CC vdouble vptrunc_vd_vd(vdouble x) { // round to integer toward 0, 
positive argument only +static INLINE CONST VECTOR_CC vdouble vptrunc_vd_vd(vdouble x) FUNC_ATTR { // round to integer toward 0, positive argument only #ifdef FULL_FP_ROUNDING return vtruncate_vd_vd(x); #else @@ -3250,7 +3274,7 @@ static INLINE CONST VECTOR_CC vdouble vptrunc_vd_vd(vdouble x) { // round to int } /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ -EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) FUNC_ATTR { vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q; vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(SLEEF_DBL_MIN)); n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n); @@ -3285,7 +3309,7 @@ EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) { return ret; } -static INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) { +static INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vrint_vd_vd(d); #else @@ -3295,7 +3319,7 @@ static INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) { #endif } -EXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) { +EXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) FUNC_ATTR { vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q; vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(SLEEF_DBL_MIN*2)); n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n); @@ -3328,7 +3352,7 @@ EXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) { } /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ -static CONST dd2 gammak(vdouble a) { +static CONST dd2 gammak(vdouble a) FUNC_ATTR { vdouble2 clc = vcast_vd2_d_d(0, 0), clln = vcast_vd2_d_d(1, 0), clld = vcast_vd2_d_d(1, 0); vdouble2 x, y, z; vdouble t, u; @@ -3408,7 +3432,7 @@ static CONST dd2 gammak(vdouble a) { return dd2setab_dd2_vd2_vd2(clc, dddiv_vd2_vd2_vd2(clln, clld)); } -EXPORT CONST VECTOR_CC vdouble xtgamma_u1(vdouble a) { +EXPORT CONST 
VECTOR_CC vdouble xtgamma_u1(vdouble a) FUNC_ATTR { dd2 d = gammak(a); vdouble2 y = ddmul_vd2_vd2_vd2(expk2(dd2geta_vd2_dd2(d)), dd2getb_vd2_dd2(d)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)); @@ -3427,7 +3451,7 @@ EXPORT CONST VECTOR_CC vdouble xtgamma_u1(vdouble a) { return r; } -EXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) { +EXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) FUNC_ATTR { dd2 d = gammak(a); vdouble2 y = ddadd2_vd2_vd2_vd2(dd2geta_vd2_dd2(d), logk2(ddabs_vd2_vd2(dd2getb_vd2_dd2(d)))); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)); @@ -3441,17 +3465,17 @@ EXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) { return r; } -static INLINE CONST vdouble2 ddmla_vd2_vd_vd2_vd2(vdouble x, vdouble2 y, vdouble2 z) { +static INLINE CONST vdouble2 ddmla_vd2_vd_vd2_vd2(vdouble x, vdouble2 y, vdouble2 z) FUNC_ATTR { return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd(y, x)); } -static INLINE CONST VECTOR_CC vdouble2 poly2dd_b(vdouble x, vdouble2 c1, vdouble2 c0) { return ddmla_vd2_vd_vd2_vd2(x, c1, c0); } -static INLINE CONST VECTOR_CC vdouble2 poly2dd(vdouble x, vdouble c1, vdouble2 c0) { return ddmla_vd2_vd_vd2_vd2(x, vcast_vd2_vd_vd(c1, vcast_vd_d(0)), c0); } -static INLINE CONST VECTOR_CC vdouble2 poly4dd(vdouble x, vdouble c3, vdouble2 c2, vdouble2 c1, vdouble2 c0) { +static INLINE CONST VECTOR_CC vdouble2 poly2dd_b(vdouble x, vdouble2 c1, vdouble2 c0) FUNC_ATTR { return ddmla_vd2_vd_vd2_vd2(x, c1, c0); } +static INLINE CONST VECTOR_CC vdouble2 poly2dd(vdouble x, vdouble c1, vdouble2 c0) FUNC_ATTR { return ddmla_vd2_vd_vd2_vd2(x, vcast_vd2_vd_vd(c1, vcast_vd_d(0)), c0); } +static INLINE CONST VECTOR_CC vdouble2 poly4dd(vdouble x, vdouble c3, vdouble2 c2, vdouble2 c1, vdouble2 c0) FUNC_ATTR { return ddmla_vd2_vd_vd2_vd2(vmul_vd_vd_vd(x, x), poly2dd(x, c3, c2), poly2dd_b(x, c1, c0)); } -EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) { +EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) FUNC_ATTR { vdouble t, x = 
vabs_vd_vd(a); vdouble2 t2; vdouble x2 = vmul_vd_vd_vd(x, x), x4 = vmul_vd_vd_vd(x2, x2); @@ -3537,8 +3561,7 @@ EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) { vdouble z = vneg_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(t2), vd2gety_vd_vd2(t2))); z = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(x, vcast_vd_d(1e-8)), vmul_vd_vd_vd(x, vcast_vd_d(1.12837916709551262756245475959)), z); - z = vsel_vd_vo_vd_vd(vge_vo_vd_vd(x, vcast_vd_d(6)), vcast_vd_d(1), z); - z = vsel_vd_vo_vd_vd(visinf_vo_vd(a), vcast_vd_d(1), z); + z = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vge_vo_vd_vd(x, vcast_vd_d(6)), visinf_vo_vd(a)), vcast_vd_d(1), z); z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(a, vcast_vd_d(0)), vcast_vd_d(0), z); z = vmulsign_vd_vd_vd(z, a); @@ -3546,7 +3569,7 @@ EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) { } /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ -EXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) { +EXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) FUNC_ATTR { vdouble s = a, r = vcast_vd_d(0), t; vdouble2 u, d, x; a = vabs_vd_vd(a); @@ -3612,11 +3635,11 @@ EXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) { #define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) ))); #define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) ))); #else -#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble d) { return x ## FUNC (d); } -#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble d) { return x ## FUNC (d); } -#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble d) { return x ## FUNC (d); } -#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y) { return x ## FUNC (x, y); } -#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y, vdouble z) { return x ## FUNC (x, y, z); } 
+#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble d) FUNC_ATTR { return x ## FUNC (d); } +#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble d) FUNC_ATTR { return x ## FUNC (d); } +#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble d) FUNC_ATTR { return x ## FUNC (d); } +#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y) FUNC_ATTR { return x ## FUNC (x, y); } +#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y, vdouble z) FUNC_ATTR { return x ## FUNC (x, y, z); } #endif DALIAS_vd2_vd(sincospi_u05) @@ -3668,12 +3691,12 @@ DALIAS_vd_vd(erfc_u15) #endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) #if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) -EXPORT CONST int xgetInt(int name) { +EXPORT CONST int xgetInt(int name) FUNC_ATTR { if (1 <= name && name <= 10) return vavailability_i(name); return 0; } -EXPORT CONST void *xgetPtr(int name) { +EXPORT CONST void *xgetPtr(int name) FUNC_ATTR { if (name == 0) return ISANAME; return (void *)0; } diff --git a/src/libm/sleefsimdsp.c b/src/libm/sleefsimdsp.c index 27a33f98950f7e9fabce956e3ef86e1b5ea530f1..c66645e9e3458eb183fa83d48b44e7bfc6fae3d6 100644 --- a/src/libm/sleefsimdsp.c +++ b/src/libm/sleefsimdsp.c @@ -223,6 +223,22 @@ extern const float Sleef_rempitabsp[]; #endif /* DORENAME */ #endif /* ENABLE_SVE */ +#ifdef ENABLE_SVESTREAM +#define CONFIG 3 +#if !defined(SLEEF_GENHEADER) +#include "helpersve.h" +#else +#include "macroonlySVESTREAM.h" +#endif +#ifdef DORENAME +#ifdef ENABLE_GNUABI +#include "renamesvestream_gnuabi.h" +#else +#include "renamesvestream.h" +#endif /* ENABLE_GNUABI */ +#endif /* DORENAME */ +#endif /* ENABLE_SVESTREAM */ + // IBM #ifdef ENABLE_VSX @@ -442,51 +458,51 @@ extern const float Sleef_rempitabsp[]; #include "df.h" -static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) { 
+static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) FUNC_ATTR { return veq_vo_vi2_vi2(vreinterpret_vi2_vf(d), vreinterpret_vi2_vf(vcast_vf_f(-0.0))); } -static INLINE VECTOR_CC vopmask vnot_vo32_vo32(vopmask x) { +static INLINE VECTOR_CC vopmask vnot_vo32_vo32(vopmask x) FUNC_ATTR { return vxor_vo_vo_vo(x, veq_vo_vi2_vi2(vcast_vi2_i(0), vcast_vi2_i(0))); } -static INLINE CONST VECTOR_CC vmask vsignbit_vm_vf(vfloat f) { +static INLINE CONST VECTOR_CC vmask vsignbit_vm_vf(vfloat f) FUNC_ATTR { return vand_vm_vm_vm(vreinterpret_vm_vf(f), vreinterpret_vm_vf(vcast_vf_f(-0.0f))); } #if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) -static INLINE CONST VECTOR_CC vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); } -static INLINE CONST VECTOR_CC vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return vreinterpret_vf_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(x)), vand_vm_vm_vm (vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(y)))); } -static INLINE CONST VECTOR_CC vfloat vsign_vf_vf(vfloat f) { +static INLINE CONST VECTOR_CC vfloat vsign_vf_vf(vfloat f) FUNC_ATTR { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(1.0f)), vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f)))); } #endif -static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vf(vfloat d) { +static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vf(vfloat d) FUNC_ATTR { return veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0x80000000)), vcast_vi2_i(0x80000000)); } -static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) { 
+static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) FUNC_ATTR { return vsel_vi2_vo_vi2_vi2(vlt_vo_vf_vf(f0, f1), x, y); } -static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) { +static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) FUNC_ATTR { return vand_vi2_vo_vi2(vsignbit_vo_vf(d), x); } -static INLINE CONST VECTOR_CC vopmask visint_vo_vf(vfloat y) { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); } +static INLINE CONST VECTOR_CC vopmask visint_vo_vf(vfloat y) FUNC_ATTR { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); } -static INLINE CONST VECTOR_CC vopmask visnumber_vo_vf(vfloat x) { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); } +static INLINE CONST VECTOR_CC vopmask visnumber_vo_vf(vfloat x) FUNC_ATTR { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); } #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) -static INLINE CONST VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) { +static INLINE CONST VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.421010862427522E-20f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d); vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 23), vcast_vi2_i(0xff)); @@ -494,7 +510,7 @@ static INLINE CONST VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) { return q; } -static INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) { +static INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) FUNC_ATTR { vint2 q = vreinterpret_vi2_vf(d); q = vsrl_vi2_vi2_i(q, 23); q = vand_vi2_vi2_vi2(q, vcast_vi2_i(0xff)); @@ -505,7 +521,7 @@ static INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) { // -EXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) { +EXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) FUNC_ATTR { vint2 e = vilogbk_vi2_vf(vabs_vf_vf(d)); e = vsel_vi2_vo_vi2_vi2(veq_vo_vf_vf(d, vcast_vf_f(0.0f)), 
vcast_vi2_i(SLEEF_FP_ILOGB0), e); e = vsel_vi2_vo_vi2_vi2(visnan_vo_vf(d), vcast_vi2_i(SLEEF_FP_ILOGBNAN), e); @@ -513,11 +529,11 @@ EXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) { return e; } -static INLINE CONST VECTOR_CC vfloat vpow2i_vf_vi2(vint2 q) { +static INLINE CONST VECTOR_CC vfloat vpow2i_vf_vi2(vint2 q) FUNC_ATTR { return vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); } -static INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { +static INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) FUNC_ATTR { vfloat u; vint2 m = vsra_vi2_vi2_i(q, 31); m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4); @@ -532,25 +548,25 @@ static INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { return vmul_vf_vf_vf(x, u); } -static INLINE CONST VECTOR_CC vfloat vldexp2_vf_vf_vi2(vfloat d, vint2 e) { +static INLINE CONST VECTOR_CC vfloat vldexp2_vf_vf_vi2(vfloat d, vint2 e) FUNC_ATTR { return vmul_vf_vf_vf(vmul_vf_vf_vf(d, vpow2i_vf_vi2(vsra_vi2_vi2_i(e, 1))), vpow2i_vf_vi2(vsub_vi2_vi2_vi2(e, vsra_vi2_vi2_i(e, 1)))); } -static INLINE CONST VECTOR_CC vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 q) { +static INLINE CONST VECTOR_CC vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 q) FUNC_ATTR { return vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vsll_vi2_vi2_i(q, 23))); } -EXPORT CONST VECTOR_CC vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } +EXPORT CONST VECTOR_CC vfloat xldexpf(vfloat x, vint2 q) FUNC_ATTR { return vldexp_vf_vf_vi2(x, q); } -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) typedef struct { vfloat d; vint2 i; } fi_t; 
-static vfloat figetd_vf_di(fi_t d) { return d.d; } -static vint2 figeti_vi2_di(fi_t d) { return d.i; } -static fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { +static vfloat figetd_vf_di(fi_t d) FUNC_ATTR { return d.d; } +static vint2 figeti_vi2_di(fi_t d) FUNC_ATTR { return d.i; } +static fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) FUNC_ATTR { fi_t r = { d, i }; return r; } @@ -560,25 +576,25 @@ typedef struct { vint2 i; } dfi_t; -static vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return d.df; } -static vint2 dfigeti_vi2_dfi(dfi_t d) { return d.i; } -static dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { +static vfloat2 dfigetdf_vf2_dfi(dfi_t d) FUNC_ATTR { return d.df; } +static vint2 dfigeti_vi2_dfi(dfi_t d) FUNC_ATTR { return d.i; } +static dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) FUNC_ATTR { dfi_t r = { v, i }; return r; } -static dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { +static dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) FUNC_ATTR { dfi.df = v; return dfi; } #endif #if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) -static INLINE CONST VECTOR_CC vfloat vorsign_vf_vf_vf(vfloat x, vfloat y) { +static INLINE CONST VECTOR_CC vfloat vorsign_vf_vf_vf(vfloat x, vfloat y) FUNC_ATTR { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); } #endif -static INLINE CONST fi_t rempisubf(vfloat x) { +static INLINE CONST fi_t rempisubf(vfloat x) FUNC_ATTR { #ifdef FULL_FP_ROUNDING vfloat y = vrint_vf_vf(vmul_vf_vf_vf(x, vcast_vf_f(4))); vint2 vi = vtruncate_vi2_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vrint_vf_vf(x), vcast_vf_f(4)))); @@ -595,7 +611,7 @@ static INLINE CONST fi_t rempisubf(vfloat x) { #endif } -static INLINE CONST dfi_t rempif(vfloat a) { +static INLINE CONST dfi_t rempif_core(vfloat a) { vfloat2 x, y; vint2 ex = vilogb2k_vi2_vf(a); #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) @@ -627,7 +643,16 @@ static INLINE CONST dfi_t rempif(vfloat a) { return 
dfisetdfi_dfi_vf2_vi2(x, q); } -EXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) { +#if defined(ENABLE_SVESTREAM) +// noinline to prevent spills causing LSRT hazards +static NOINLINE CONST dfi_t rempif(vfloat a) FUNC_ATTR { + return rempif_stream(a, &rempif_core); +} +#else +#define rempif rempif_core +#endif + +EXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vfloat u, s, r = d; @@ -733,7 +758,7 @@ EXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vfloat u, s, r = d; @@ -842,7 +867,7 @@ EXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vopmask o; @@ -972,7 +997,7 @@ EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vfloat u, v; @@ -1070,7 +1095,7 @@ EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vfloat u; @@ -1168,7 +1193,7 @@ EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -EXPORT CONST VECTOR_CC vfloat xfastsinf_u3500(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xfastsinf_u3500(vfloat d) FUNC_ATTR { vint2 q; vfloat u, s, t = d; @@ -1192,7 +1217,7 @@ EXPORT CONST VECTOR_CC vfloat xfastsinf_u3500(vfloat d) { return u; } -EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) FUNC_ATTR { 
vint2 q; vfloat u, s, t = d; @@ -1236,7 +1261,7 @@ EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) { #define XMODFF xmodff #endif -TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF(vfloat d) { +TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vopmask o; @@ -1363,7 +1388,7 @@ TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF(vfloat d) { #endif // #if !defined(DETERMINISTIC) } -TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) { +TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vopmask o; @@ -1480,7 +1505,7 @@ TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) { } #if !defined(DETERMINISTIC) -TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) { +TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) FUNC_ATTR { vopmask o; vfloat u, s, t, rx, ry; vfloat2 r, x, s2; @@ -1540,7 +1565,7 @@ TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) { return r; } -TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U35(vfloat d) { +TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U35(vfloat d) FUNC_ATTR { vopmask o; vfloat u, s, t, rx, ry; vfloat2 r; @@ -1594,7 +1619,7 @@ TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U35(vfloat d) { return r; } -TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) { +TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) FUNC_ATTR { vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); fr = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), vcast_vf_f(0), fr); @@ -1606,31 +1631,31 @@ TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) { } #ifdef ENABLE_GNUABI -EXPORT VECTOR_CC void xsincosf(vfloat a, float *ps, float *pc) { +EXPORT VECTOR_CC void xsincosf(vfloat a, float *ps, float *pc) FUNC_ATTR { vfloat2 r = sincosfk(a); vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r)); vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r)); } -EXPORT VECTOR_CC void xsincosf_u1(vfloat a, float *ps, float *pc) { +EXPORT VECTOR_CC void xsincosf_u1(vfloat a, float *ps, float 
*pc) FUNC_ATTR { vfloat2 r = sincosfk_u1(a); vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r)); vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r)); } -EXPORT VECTOR_CC void xsincospif_u05(vfloat a, float *ps, float *pc) { +EXPORT VECTOR_CC void xsincospif_u05(vfloat a, float *ps, float *pc) FUNC_ATTR { vfloat2 r = sincospifk_u05(a); vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r)); vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r)); } -EXPORT VECTOR_CC void xsincospif_u35(vfloat a, float *ps, float *pc) { +EXPORT VECTOR_CC void xsincospif_u35(vfloat a, float *ps, float *pc) FUNC_ATTR { vfloat2 r = sincospifk_u35(a); vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r)); vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r)); } -EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) { +EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) FUNC_ATTR { vfloat2 r = modffk(a); vstoreu_v_p_vf(iptr, vf2gety_vf_vf2(r)); return vf2getx_vf_vf2(r); @@ -1638,7 +1663,7 @@ EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) { #endif // #ifdef ENABLE_GNUABI #endif // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) FUNC_ATTR { #if !defined(DETERMINISTIC) vint2 q; vfloat u, v; @@ -1746,7 +1771,7 @@ EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) FUNC_ATTR { vfloat s, t, u; vint2 q; @@ -1783,7 +1808,7 @@ EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) { +static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) FUNC_ATTR { vfloat s, t, u; vint2 q; vopmask p; @@ -1816,12 +1841,12 @@ static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) { return t; } -static INLINE CONST VECTOR_CC vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) { +static INLINE CONST VECTOR_CC vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) FUNC_ATTR { 
return vreinterpret_vf_vm(vand_vm_vo32_vm(visinf_vo_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), vreinterpret_vm_vf(m)))); } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) { +EXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) FUNC_ATTR { vfloat r = atan2kf(vabs_vf_vf(y), x); r = vmulsign_vf_vf_vf(r, x); @@ -1834,7 +1859,7 @@ EXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) { return r; } -EXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f)); vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))); vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2)), u; @@ -1850,7 +1875,7 @@ EXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) { return vmulsign_vf_vf_vf(r, d); } -EXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f)); vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u; @@ -1875,7 +1900,7 @@ EXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) { // -static INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) { +static INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) FUNC_ATTR { vfloat u; vfloat2 s, t; vint2 q; @@ -1914,7 +1939,7 @@ static INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) { +EXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(2.9387372783541830947e-39f)); // nexttowardf((1.0 / FLT_MAX), 1) x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1 << 24)), x); y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1 << 24)), y); @@ 
-1931,7 +1956,7 @@ EXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) { return r; } -EXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f)); vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u; vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2)); @@ -1951,7 +1976,7 @@ EXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) { return vmulsign_vf_vf_vf(r, d); } -EXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) FUNC_ATTR { vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f)); vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u; vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2)); @@ -1976,7 +2001,7 @@ EXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) { return vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)); } -EXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) FUNC_ATTR { vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0)); vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2)); r = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(1.570796326794896557998982), r); @@ -1987,7 +2012,7 @@ EXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) { // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) FUNC_ATTR { vfloat x, x2, t, m; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) @@ -2026,7 +2051,7 @@ EXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) { #endif // #if !defined(DETERMINISTIC) #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) { 
+EXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) FUNC_ATTR { vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f))); vfloat s, u; @@ -2051,7 +2076,7 @@ EXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) { +static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) FUNC_ATTR { vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f))); vfloat s, u; @@ -2076,7 +2101,7 @@ static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) { } #if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4) -EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) FUNC_ATTR { vfloat e = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1)))); vfloat m = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), vreinterpret_vi2_vf(d)))); float32x4_t x = vrsqrteq_f32(m); @@ -2093,17 +2118,17 @@ EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { return u; } #elif defined(ENABLE_VECEXT) -EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) FUNC_ATTR { vfloat q = vsqrt_vf_vf(d); q = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), q); return vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), q); } #else -EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { return vsqrt_vf_vf(d); } +EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) FUNC_ATTR { return vsqrt_vf_vf(d); } #endif #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) FUNC_ATTR { vfloat x, y, q = vcast_vf_f(1.0), t; vint2 e, qu, re; @@ -2144,7 +2169,7 @@ EXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) { #endif // #if !defined(DETERMINISTIC) #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat 
d) { +EXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat d) FUNC_ATTR { vfloat x, y, z, t; vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v; vint2 e, qu, re; @@ -2201,7 +2226,7 @@ EXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) FUNC_ATTR { vfloat2 x, x2; vfloat t, m; @@ -2237,7 +2262,7 @@ static INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) { return s; } -static INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) { +static INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) FUNC_ATTR { vfloat x, x2, t, m; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) @@ -2271,7 +2296,7 @@ static INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) FUNC_ATTR { vfloat2 x; vfloat t, m, x2; @@ -2313,7 +2338,7 @@ EXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) FUNC_ATTR { vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f)); vint2 q = vrint_vi2_vf(u); vfloat2 s, t; @@ -2340,7 +2365,7 @@ static INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) { return u; } -static INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) { +static INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) FUNC_ATTR { vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f))); vfloat s, u; @@ -2363,7 +2388,7 @@ static INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) FUNC_ATTR { #if 1 vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24))); vopmask yisodd = 
vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), @@ -2406,7 +2431,7 @@ EXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) { #endif } -EXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) FUNC_ATTR { vfloat result = expk3f(vmul_vf_vf_vf(logk3f(vabs_vf_vf(x)), y)); vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24))); vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), @@ -2421,7 +2446,7 @@ EXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) FUNC_ATTR { vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f)); vint2 q = vrint_vi2_vf(u); vfloat2 s, t; @@ -2450,7 +2475,7 @@ static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); @@ -2464,7 +2489,7 @@ EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); @@ -2477,7 +2502,7 @@ EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vfloat2 d = 
expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); vfloat2 e = dfrec_vf2_vf2(d); @@ -2492,7 +2517,7 @@ EXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) FUNC_ATTR { vfloat e = expm1fk(vabs_vf_vf(x)); vfloat y = vdiv_vf_vf_vf(vadd_vf_vf_vf(e, vcast_vf_f(2)), vadd_vf_vf_vf(e, vcast_vf_f(1))); y = vmul_vf_vf_vf(y, vmul_vf_vf_vf(vcast_vf_f(0.5f), e)); @@ -2505,7 +2530,7 @@ EXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) FUNC_ATTR { vfloat e = xexpf(vabs_vf_vf(x)); vfloat y = vmla_vf_vf_vf_vf(vcast_vf_f(0.5f), e, vdiv_vf_vf_vf(vcast_vf_f(0.5), e)); @@ -2516,7 +2541,7 @@ EXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) FUNC_ATTR { vfloat d = expm1fk(vmul_vf_vf_vf(vcast_vf_f(2), vabs_vf_vf(x))); vfloat y = vdiv_vf_vf_vf(d, vadd_vf_vf_vf(vcast_vf_f(2), d)); @@ -2529,7 +2554,7 @@ EXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) { +static INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) FUNC_ATTR { vfloat2 x, x2, m, s; vfloat t; vint2 e; @@ -2557,7 +2582,7 @@ static INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1)); vfloat2 d; @@ -2578,7 +2603,7 @@ EXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) { return y; } -EXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) FUNC_ATTR { vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, 
vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x)); vfloat y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)); @@ -2588,13 +2613,12 @@ EXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) { y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y))); - y = vreinterpret_vf_vm(vor_vm_vo32_vm(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y))); - y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y))); + y = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), visnan_vo_vf(x)), vreinterpret_vm_vf(y))); return y; } -EXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) FUNC_ATTR { vfloat y = vabs_vf_vf(x); vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y)))); y = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(y, vcast_vf_f(1.0)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(SLEEF_INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5)))))); @@ -2608,7 +2632,7 @@ EXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) { #endif // #if !defined(DETERMINISTIC) #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) FUNC_ATTR { vfloat u = vrint_vf_vf(d), s; vint2 q = vrint_vi2_vf(u); @@ -2635,7 +2659,7 @@ EXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) { return u; } -EXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) FUNC_ATTR { vfloat u = vrint_vf_vf(d), s; vint2 q = vrint_vi2_vf(u); @@ -2657,7 +2681,7 @@ EXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) { return u; } -EXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) FUNC_ATTR { vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s; vint2 q = 
vrint_vi2_vf(u); @@ -2681,7 +2705,7 @@ EXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) { return u; } -EXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) FUNC_ATTR { vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s; vint2 q = vrint_vi2_vf(u); @@ -2704,7 +2728,7 @@ EXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) { return u; } -EXPORT CONST VECTOR_CC vfloat xexpm1f(vfloat a) { +EXPORT CONST VECTOR_CC vfloat xexpm1f(vfloat a) FUNC_ATTR { vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0)); vfloat x = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)); x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.72283172607421875f)), vcast_vf_f(SLEEF_INFINITYf), x); @@ -2715,7 +2739,7 @@ EXPORT CONST VECTOR_CC vfloat xexpm1f(vfloat a) { #endif // #if !defined(DETERMINISTIC) #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xlog10f(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlog10f(vfloat d) FUNC_ATTR { vfloat2 x; vfloat t, m, x2; @@ -2760,7 +2784,7 @@ EXPORT CONST VECTOR_CC vfloat xlog10f(vfloat d) { return r; } -EXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) FUNC_ATTR { vfloat2 x; vfloat t, m, x2; @@ -2806,7 +2830,7 @@ EXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) { return r; } -EXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) FUNC_ATTR { vfloat m, t, x, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) @@ -2845,7 +2869,7 @@ EXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) { return r; } -EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) FUNC_ATTR { vfloat2 x; vfloat t, m, x2; @@ -2881,7 +2905,12 @@ EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) { // Use log(d) if d too large to use core approximation. 
vopmask ocore = vle_vo_vf_vf(d, vcast_vf_f(LOG1PF_BOUND)); +#if defined(ENABLE_SVESTREAM) + r = vsel_vf_vo_vf_vf(ocore, r, xlogf_u1(d)); +#else if(!LIKELY(vtestallones_i_vo32 (ocore))) r = vsel_vf_vo_vf_vf(ocore, r, xlogf_u1(d)); +#endif + r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r))); r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-SLEEF_INFINITYf), r); r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r); @@ -2893,11 +2922,11 @@ EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) { // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xfabsf(vfloat x) { return vabs_vf_vf(x); } +EXPORT CONST VECTOR_CC vfloat xfabsf(vfloat x) FUNC_ATTR { return vabs_vf_vf(x); } -EXPORT CONST VECTOR_CC vfloat xcopysignf(vfloat x, vfloat y) { return vcopysign_vf_vf_vf(x, y); } +EXPORT CONST VECTOR_CC vfloat xcopysignf(vfloat x, vfloat y) FUNC_ATTR { return vcopysign_vf_vf_vf(x, y); } -EXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) FUNC_ATTR { #if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC) return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmax_vf_vf_vf(x, y)); #else @@ -2905,7 +2934,7 @@ EXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) { #endif } -EXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) FUNC_ATTR { #if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC) return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmin_vf_vf_vf(x, y)); #else @@ -2913,13 +2942,13 @@ EXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) { #endif } -EXPORT CONST VECTOR_CC vfloat xfdimf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xfdimf(vfloat x, vfloat y) FUNC_ATTR { vfloat ret = vsub_vf_vf_vf(x, y); ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(ret, vcast_vf_f(0)), veq_vo_vf_vf(x, 
y)), vcast_vf_f(0), ret); return ret; } -EXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vtruncate_vf_vf(x); #else @@ -2928,19 +2957,19 @@ EXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) { #endif } -EXPORT CONST VECTOR_CC vfloat xfloorf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xfloorf(vfloat x) FUNC_ATTR { vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } -EXPORT CONST VECTOR_CC vfloat xceilf(vfloat x) { +EXPORT CONST VECTOR_CC vfloat xceilf(vfloat x) FUNC_ATTR { vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); fr = vsel_vf_vo_vf_vf(vle_vo_vf_vf(fr, vcast_vf_f(0)), fr, vsub_vf_vf_vf(fr, vcast_vf_f(1.0f))); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } -EXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) FUNC_ATTR { vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f)); vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); x = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vle_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(fr, vcast_vf_f(0))), vsub_vf_vf_vf(x, vcast_vf_f(1.0f)), x); @@ -2949,7 +2978,7 @@ EXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) { return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INT64_C(1) << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d)); } -EXPORT CONST VECTOR_CC vfloat xrintf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xrintf(vfloat d) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vrint_vf_vf(d); #else @@ -2959,7 +2988,7 @@ EXPORT CONST VECTOR_CC vfloat xrintf(vfloat 
d) { #endif } -EXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) { +EXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) FUNC_ATTR { #ifdef ENABLE_FMA_SP return vfma_vf_vf_vf_vf(x, y, z); #else @@ -2997,7 +3026,7 @@ EXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) { } #endif // #if !defined(DETERMINISTIC) -SQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) { +SQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) FUNC_ATTR { #if defined(ENABLE_FMA_SP) vfloat q, w, x, y, z; @@ -3064,7 +3093,7 @@ SQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) { #endif } -EXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) FUNC_ATTR { #ifdef ACCURATE_SQRT return vsqrt_vf_vf(d); #else @@ -3074,7 +3103,7 @@ EXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) FUNC_ATTR { x = vabs_vf_vf(x); y = vabs_vf_vf(y); vfloat min = vmin_vf_vf_vf(x, y), n = min; @@ -3095,7 +3124,7 @@ EXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) { return ret; } -EXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) FUNC_ATTR { x = vabs_vf_vf(x); y = vabs_vf_vf(y); vfloat min = vmin_vf_vf_vf(x, y); @@ -3110,7 +3139,7 @@ EXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) { return ret; } -EXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) FUNC_ATTR { x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), y), x); vint2 xi2 = vreinterpret_vi2_vf(x); vopmask c = vxor_vo_vo_vo(vsignbit_vo_vf(x), vge_vo_vf_vf(y, x)); @@ -3133,7 +3162,7 @@ EXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) { return ret; } -EXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) { 
+EXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) FUNC_ATTR { x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SLEEF_FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 30)), x); vmask xm = vreinterpret_vm_vf(x); @@ -3149,7 +3178,7 @@ EXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) { } #endif // #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) { +EXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) FUNC_ATTR { /* x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SLEEF_FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 63)), x); @@ -3163,12 +3192,12 @@ EXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) { return vcast_vi2_i(0); } -static INLINE CONST VECTOR_CC vfloat vtoward0_vf_vf(vfloat x) { +static INLINE CONST VECTOR_CC vfloat vtoward0_vf_vf(vfloat x) FUNC_ATTR { vfloat t = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vreinterpret_vi2_vf(x), vcast_vi2_i(1))); return vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), t); } -static INLINE CONST VECTOR_CC vfloat vptrunc_vf_vf(vfloat x) { +static INLINE CONST VECTOR_CC vfloat vptrunc_vf_vf(vfloat x) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vtruncate_vf_vf(x); #else @@ -3178,7 +3207,7 @@ static INLINE CONST VECTOR_CC vfloat vptrunc_vf_vf(vfloat x) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) FUNC_ATTR { vfloat nu = vabs_vf_vf(x), de = vabs_vf_vf(y), s = vcast_vf_f(1), q; vopmask o = vlt_vo_vf_vf(de, vcast_vf_f(SLEEF_FLT_MIN)); nu = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(nu, vcast_vf_f(UINT64_C(1) << 25)), nu); @@ -3213,7 +3242,7 @@ EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) { return ret; } -static INLINE CONST VECTOR_CC vfloat vrintfk2_vf_vf(vfloat d) { +static INLINE CONST VECTOR_CC vfloat vrintfk2_vf_vf(vfloat d) FUNC_ATTR { #ifdef FULL_FP_ROUNDING return vrint_vf_vf(d); #else @@ -3223,7 +3252,7 @@ static INLINE CONST VECTOR_CC vfloat 
vrintfk2_vf_vf(vfloat d) { #endif } -EXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) { +EXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) FUNC_ATTR { vfloat n = vabs_vf_vf(x), d = vabs_vf_vf(y), s = vcast_vf_f(1), q; vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(SLEEF_FLT_MIN*2)); n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 25)), n); @@ -3256,7 +3285,7 @@ EXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) { // -static INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) FUNC_ATTR { vopmask o; vfloat u, s, t; vfloat2 x, s2; @@ -3294,7 +3323,7 @@ static INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) FUNC_ATTR { vfloat2 x = sinpifk(d); vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)); @@ -3306,7 +3335,7 @@ EXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -static INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) { +static INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) FUNC_ATTR { vopmask o; vfloat u, s, t; vfloat2 x, s2; @@ -3344,7 +3373,7 @@ static INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) { +EXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) FUNC_ATTR { vfloat2 x = cospifk(d); vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)); @@ -3355,21 +3384,21 @@ EXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) { } #endif // #if !defined(DETERMINISTIC) -#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) +#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_SVESTREAM) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || 
defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA)) typedef struct { vfloat2 a, b; } df2; -static df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { +static df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) FUNC_ATTR { df2 r = { a, b }; return r; } -static vfloat2 df2geta_vf2_df2(df2 d) { return d.a; } -static vfloat2 df2getb_vf2_df2(df2 d) { return d.b; } +static vfloat2 df2geta_vf2_df2(df2 d) FUNC_ATTR { return d.a; } +static vfloat2 df2getb_vf2_df2(df2 d) FUNC_ATTR { return d.b; } #endif /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ -static CONST df2 gammafk(vfloat a) { +static CONST df2 gammafk(vfloat a) FUNC_ATTR { vfloat2 clc = vcast_vf2_f_f(0, 0), clln = vcast_vf2_f_f(1, 0), clld = vcast_vf2_f_f(1, 0); vfloat2 x, y, z; vfloat t, u; @@ -3433,7 +3462,7 @@ static CONST df2 gammafk(vfloat a) { } #if !defined(DETERMINISTIC) -EXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) { +EXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) FUNC_ATTR { df2 d = gammafk(a); vfloat2 y = dfmul_vf2_vf2_vf2(expk2f(df2geta_vf2_df2(d)), df2getb_vf2_df2(d)); vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)); @@ -3452,7 +3481,7 @@ EXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) { return r; } -EXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) { +EXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) FUNC_ATTR { df2 d = gammafk(a); vfloat2 y = dfadd2_vf2_vf2_vf2(df2geta_vf2_df2(d), logk2f(dfabs_vf2_vf2(df2getb_vf2_df2(d)))); vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)); @@ -3466,17 +3495,17 @@ EXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) { return r; } -static INLINE CONST vfloat2 dfmla_vf2_vf_vf2_vf2(vfloat x, vfloat2 y, vfloat2 z) { +static INLINE CONST vfloat2 dfmla_vf2_vf_vf2_vf2(vfloat x, vfloat2 y, vfloat2 z) FUNC_ATTR { return dfadd_vf2_vf2_vf2(z, dfmul_vf2_vf2_vf(y, x)); } -static INLINE CONST vfloat2 poly2df_b(vfloat x, vfloat2 c1, vfloat2 c0) { return dfmla_vf2_vf_vf2_vf2(x, c1, c0); } -static INLINE CONST vfloat2 
poly2df(vfloat x, vfloat c1, vfloat2 c0) { return dfmla_vf2_vf_vf2_vf2(x, vcast_vf2_vf_vf(c1, vcast_vf_f(0)), c0); } -static INLINE CONST vfloat2 poly4df(vfloat x, vfloat c3, vfloat2 c2, vfloat2 c1, vfloat2 c0) { +static INLINE CONST vfloat2 poly2df_b(vfloat x, vfloat2 c1, vfloat2 c0) FUNC_ATTR { return dfmla_vf2_vf_vf2_vf2(x, c1, c0); } +static INLINE CONST vfloat2 poly2df(vfloat x, vfloat c1, vfloat2 c0) FUNC_ATTR { return dfmla_vf2_vf_vf2_vf2(x, vcast_vf2_vf_vf(c1, vcast_vf_f(0)), c0); } +static INLINE CONST vfloat2 poly4df(vfloat x, vfloat c3, vfloat2 c2, vfloat2 c1, vfloat2 c0) FUNC_ATTR { return dfmla_vf2_vf_vf2_vf2(vmul_vf_vf_vf(x, x), poly2df(x, c3, c2), poly2df_b(x, c1, c0)); } -EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) { +EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) FUNC_ATTR { vfloat t, x = vabs_vf_vf(a); vfloat2 t2; vfloat x2 = vmul_vf_vf_vf(x, x), x4 = vmul_vf_vf_vf(x2, x2); @@ -3532,8 +3561,7 @@ EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) { t2 = vsel_vf2_vo_vf2_vf2(vlt_vo_vf_vf(x, vcast_vf_f(1e-4)), dfmul_vf2_vf2_vf(vcast_vf2_f_f(-1.1283792257308959961, 5.8635383422197591097e-08), x), t2); vfloat z = vneg_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(t2), vf2gety_vf_vf2(t2))); - z = vsel_vf_vo_vf_vf(vge_vo_vf_vf(x, vcast_vf_f(6)), vcast_vf_f(1), z); - z = vsel_vf_vo_vf_vf(visinf_vo_vf(a), vcast_vf_f(1), z); + z = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vge_vo_vf_vf(x, vcast_vf_f(6)), visinf_vo_vf(a)), vcast_vf_f(1), z); z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(a, vcast_vf_f(0)), vcast_vf_f(0), z); z = vmulsign_vf_vf_vf(z, a); @@ -3541,7 +3569,7 @@ EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) { } /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ -EXPORT CONST VECTOR_CC vfloat xerfcf_u15(vfloat a) { +EXPORT CONST VECTOR_CC vfloat xerfcf_u15(vfloat a) FUNC_ATTR { vfloat s = a, r = vcast_vf_f(0), t; vfloat2 u, d, x; a = vabs_vf_vf(a); @@ -3587,10 +3615,10 @@ EXPORT CONST VECTOR_CC vfloat xerfcf_u15(vfloat a) { #define DALIAS_vf_vf_vf(FUNC) 
EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) ))); #define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) ))); #else -#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat d) { return x ## FUNC (d); } -#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat d) { return x ## FUNC (d); } -#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y) { return x ## FUNC (x, y); } -#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y, vfloat z) { return x ## FUNC (x, y, z); } +#define DALIAS_vf_vf(FNAME) EXPORT CONST VECTOR_CC vfloat y ## FNAME(vfloat d) FUNC_ATTR { return x ## FNAME (d); } +#define DALIAS_vf2_vf(FNAME) EXPORT CONST VECTOR_CC vfloat2 y ## FNAME(vfloat d) FUNC_ATTR { return x ## FNAME (d); } +#define DALIAS_vf_vf_vf(FNAME) EXPORT CONST VECTOR_CC vfloat y ## FNAME(vfloat x, vfloat y) FUNC_ATTR { return x ## FNAME (x, y); } +#define DALIAS_vf_vf_vf_vf(FNAME) EXPORT CONST VECTOR_CC vfloat y ## FNAME(vfloat x, vfloat y, vfloat z) FUNC_ATTR { return x ## FNAME (x, y, z); } #endif DALIAS_vf2_vf(sincospif_u05) @@ -3655,12 +3683,12 @@ DALIAS_vf_vf_vf(fastpowf_u3500) #endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) #if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) -EXPORT CONST int xgetIntf(int name) { +EXPORT CONST int xgetIntf(int name) FUNC_ATTR { if (1 <= name && name <= 10) return vavailability_i(name); return 0; } -EXPORT CONST void *xgetPtrf(int name) { +EXPORT CONST void *xgetPtrf(int name) FUNC_ATTR { if (name == 0) return ISANAME; return (void *)0; }